TinTin++ Mud Client The TinTin++ message board

 
 FAQFAQ   SearchSearch   MemberlistMemberlist   UsergroupsUsergroups   RegisterRegister 
 ProfileProfile   Log in to check your private messagesLog in to check your private messages   Log inLog in 
TinTin++ Mud Client

Tighten up on UTF-8 sequences

 
Post new topic   Reply to topic    The TinTin++ message board Forum Index -> Bug Reports
View previous topic :: View next topic  
Author Message
Slysven



Joined: 10 Apr 2011
Posts: 365
Location: As "Jomin al'Bara" in WoTMUD or Wiltshire, UK

PostPosted: Wed Apr 30, 2014 8:11 pm    Post subject: Tighten up on UTF-8 sequences Reply with quote

May I propose the following to make the incoming parser more strict on validating UTF-8 byte sequences, by changing the following in tokenize.c:
Code:
char *get_arg_parse(struct session *ses, struct scriptnode *token)
{
   static char buf[5];

   if (HAS_BIT(ses->flags, SES_FLAG_BIG5) && token->data->arg[0] & 128 && token->data->arg[1] != 0)
   {
      token->data->arg += sprintf(buf, "%c%c", token->data->arg[0], token->data->arg[1]);
   }
   else if (HAS_BIT(ses->flags, SES_FLAG_UTF8) && (token->data->arg[0] & 192) == 192 && token->data->arg[1] != 0)
   {
      if ((token->data->arg[0] & 240) == 240 && token->data->arg[2] != 0 && token->data->arg[3] != 0)
      {
         token->data->arg += sprintf(buf, "%c%c%c%c", token->data->arg[0], token->data->arg[1], token->data->arg[2], token->data->arg[3]);
      }
      else if ((token->data->arg[0] & 224) == 224 && token->data->arg[2] != 0)
      {
         token->data->arg += sprintf(buf, "%c%c%c", token->data->arg[0], token->data->arg[1], token->data->arg[2]);
      }
      else
      {
         token->data->arg += sprintf(buf, "%c%c", token->data->arg[0], token->data->arg[1]);
      }
   }
   else
   {
      token->data->arg += sprintf(buf, "%c", token->data->arg[0]);
   }

   return buf;
}
to
Code:
char *get_arg_parse(struct session *ses, struct scriptnode *token)
{
   static char buf[5];

   if (HAS_BIT(ses->flags, SES_FLAG_BIG5) && token->data->arg[0] & 128 && token->data->arg[1] != 0)
   {
      token->data->arg += sprintf(buf, "%c%c", token->data->arg[0], token->data->arg[1]);
   }
   else if (HAS_BIT(ses->flags, SES_FLAG_UTF8) && (token->data->arg[0] & 192) == 192 && token->data->arg[1] != 0)
   {
      if (strlen(token->data->arg) > 3)
      {
         if ((token->data->arg[0] & 248) == 240 && (token->data->arg[1] & 192) == 128 && (token->data->arg[2] & 192) == 128 && (token->data->arg[3] & 192) == 128)
         {
            token->data->arg += sprintf(buf, "%c%c%c%c", token->data->arg[0], token->data->arg[1], token->data->arg[2], token->data->arg[3]);
         }
         else
         {
            token->data->arg += sprintf(buf, "%c%c", '\x0ff', '\x0fd');
         }
      }
      else if (strlen(token->data->arg) > 2)
      {
         if ((token->data->arg[0] & 240) == 224 && (token->data->arg[1] & 192) == 128 && (token->data->arg[2] & 192) == 128)
         {
            token->data->arg += sprintf(buf, "%c%c%c", token->data->arg[0], token->data->arg[1], token->data->arg[2]);
         }
         else
         {
            token->data->arg += sprintf(buf, "%c%c", '\x0ff', '\x0fd');
         }
      }
      else
      {
         if ((token->data->arg[0] & 224) == 192 && (token->data->arg[1] & 192) == 128)
         {
            token->data->arg += sprintf(buf, "%c%c", token->data->arg[0], token->data->arg[1]);
         }
         else
         {
            token->data->arg += sprintf(buf, "%c%c", '\x0ff', '\x0fd');
         }
      }
   }
   else
   {
      token->data->arg += sprintf(buf, "%c", token->data->arg[0]);
   }

   return buf;
}
This will replace invalid sequences with the replacement character '�' {U+FFFD} if TinTin++ is set to UTF-8 mode and gets something that isn't.

In support of this I note from wikipedia that:
Quote:
...
Overlong encodings

The standard specifies that the correct encoding of a code point use only the minimum number of bytes required to hold the significant bits of the code point. Longer encodings are called overlong and are not valid UTF-8 representations of the code point. This rule maintains a one-to-one correspondence between code points and their valid encodings, so that there is a unique valid encoding for each code point. Allowing multiple encodings would make testing for string equality difficult to define.
...
RFC 3629 states "Implementations of the decoding algorithm MUST protect against decoding invalid sequences." The Unicode Standard requires decoders to "...treat any ill-formed code unit sequence as an error condition. This guarantees that it will neither interpret nor emit an ill-formed code unit sequence."
Back to top
View user's profile Send private message
Scandum
Site Admin


Joined: 03 Dec 2004
Posts: 3796

PostPosted: Thu May 01, 2014 6:50 am    Post subject: Reply with quote

I think that code is invalid if arg holds a 2 byte sequence but is longer than 3 characters.

I also prefer not to change the data, I'd rather let the terminal figure it out.
Back to top
View user's profile Send private message Send e-mail
Display posts from previous:   
Post new topic   Reply to topic    The TinTin++ message board Forum Index -> Bug Reports All times are GMT - 5 Hours
Page 1 of 1

 
Jump to:  
You cannot post new topics in this forum
You cannot reply to topics in this forum
You cannot edit your posts in this forum
You cannot delete your posts in this forum
You cannot vote in polls in this forum
Get TinTin++ Mud Client at SourceForge.net. Fast, secure and Free Open Source software downloads Get TinTin++ Mud Client at SourceForge.net. Fast, secure and Free Open Source software downloads
TinTin++ Homepage

Powered by phpBB © 2001, 2002 phpBB Group