Logo Search packages:      
Sourcecode: uni2ascii version File versions  Download package

ascii2uni.c

/* Time-stamp: <2006-07-05 17:38:59 poser>
 *
 * Convert text containing various 7-bit ASCII escapes to UTF-8 Unicode.
 *
 * Copyright (C) 2005-2006 William J. Poser (billposer@alum.mit.edu)
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 * or go to the web page:  http://www.gnu.org/licenses/gpl.txt.
 */

#include "config.h"
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <ctype.h>
#ifdef HAVE_LOCALE_H
#include <locale.h>
#endif
#ifdef HAVE_LIBINTL_H
#include <libintl.h>
#define _(String) gettext(String)
#else
#define _(x) (x)
#endif
#include "unicode.h"
#include "enttbl.h"
#include "exitcode.h"
/*
 * Build stamp. When the compiler predefines both __DATE__ and __TIME__,
 * HAVE_DATE_TIME is defined and compdate holds "Compiled <date> <time>";
 * otherwise compdate is the empty string. ShowVersion() prints compdate
 * only when HAVE_DATE_TIME is defined.
 */
#if defined(__DATE__) && defined(__TIME__)
#define HAVE_DATE_TIME
char compdate[]= "Compiled " __DATE__ " " __TIME__ ;
#else
char compdate[]= "";
#endif

/* Version string (PACKAGE_VERSION is presumably supplied by config.h) and
   the program name used as a prefix in messages. */
char version[]=PACKAGE_VERSION;
char pgname[]="ascii2uni";

/* Size passed to fgets() when reading a line of input in main(). */
#define LBUFSIZE 2048

/*
 * Write the version banner to stderr: a blank line, the program name and
 * version, the compilation stamp (only when HAVE_DATE_TIME is defined),
 * the copyright line and the licence notice.
 */
void
ShowVersion(void){
  FILE *out = stderr;

  fputc('\n',out);
  fprintf(out,"%s  %s\n",pgname,version);
#ifdef HAVE_DATE_TIME
  fputs(compdate,out);
  fputc('\n',out);
#endif
  fputs("Copyright (C) 2005-2006 William J. Poser\n",out);
  /* Kept as a format string, exactly as before, so a translated message
     is interpreted the same way. */
  fprintf(out,_("Released under the terms of the GNU General Public License.\n\n"));
}

/*
 * Print the usage summary to stderr: general flags first, then the
 * mutually exclusive conversion specifications, then the bug-report
 * address. Every message except none is passed through _() for
 * translation; pgname fills the %s in the "Usage:" line.
 *
 * The descriptions of -I and -J name the escape conventions they really
 * implement: "=XX" is Quoted-Printable (RFC 2045) and "%XX" is the URI
 * escape (RFC 2396).
 */
void
ShowUsage(void){
  fprintf(stderr,_("This program is a filter which converts 7-bit ASCII text\n\
containing various representations for non-ASCII characters\nto UTF-8 Unicode.\n"));
  fprintf(stderr,_("Usage: %s [flags]\n"),pgname);
  fprintf(stderr,_("       -h Print this usage message.\n"));
  fprintf(stderr,_("       -p Pure - the input consists entirely of escapes separated by whitespace.\n"));
  fprintf(stderr,_("       -q Quiet - don't chat.\n"));
  fprintf(stderr,_("       -v Print version information.\n"));
  /* NOTE(review): -8 is listed here but the option string passed to
     getopt() in main() does not contain '8' - confirm whether the flag
     should be implemented or this line dropped. */
  fprintf(stderr,_("       -8 Convert only tokens above the ASCII range.\n"));
  fprintf(stderr,_("   Give at most one of the following conversion specifications:\n"));
  fprintf(stderr,
	  _("       -A Convert hexadecimal numbers with prefix U in angle-brackets(<U00E9>)\n"));
  fprintf(stderr,
	  _("       -B Convert \\x-escaped hexadecimal numbers (\\x00E9)\n"));
  fprintf(stderr,
	  _("       -C Convert \\x-escaped hexadecimal numbers in braces (\\x{00E9})\n"));
  fprintf(stderr,
	  _("       -D Convert decimal HTML numeric character references (&#0233;)\n"));
  fprintf(stderr,
	  _("       -E Convert hexadecimal with prefix U (U00E9)\n"));
  fprintf(stderr,
	  _("       -F Convert hexadecimal with prefix u (u00E9)\n"));
  fprintf(stderr,
	  _("       -G Convert hexadecimal in single quotes with prefix X (X\'00E9\')\n"));
  fprintf(stderr,
	  _("       -H Convert hexadecimal HTML numeric character references (&#x00E9;)\n"));
  fprintf(stderr,
	  _("       -I Convert hexadecimal UTF-8 with each byte's hex preceded by an =-sign (=C3=A9)\n\t\tThis is the Quoted Printable format defined by RFC 2045.\n"));
  fprintf(stderr,
	  _("       -J Convert hexadecimal UTF-8 with each byte's hex preceded by a %%-sign  (%%C3%%A9).\n\t\tThis is the URI escape format defined by RFC 2396.\n"));
  fprintf(stderr,
	  _("       -K Convert octal UTF-8 with each byte escaped by a backslash (\\303\\251)\n"));
  fprintf(stderr,
	  _("       -L Convert \\u-escaped hex (\\u00E9) within the BMP (U+0000-U+FFFF),\n\t\t\\U-escaped hex (\\U00010024) outside it.\n"));
  fprintf(stderr,
	  _("       -M Convert hexadecimal SGML numeric character references (\\#x00E9;)\n"));
  fprintf(stderr,
	  _("       -N Convert decimal SGML numeric character references (\\#0233;)\n"));
  fprintf(stderr,
	  _("       -O Convert octal escapes for the three low bytes in big-endian order (\\000\\000\\351)\n"));
  fprintf(stderr,
	  _("       -P Convert hexadecimal numbers with prefix U+ (U+00E9)\n"));
  fprintf(stderr,
	  _("       -Q Convert HTML character entities (&eacute;)\n"));
  fprintf(stderr,
	  _("       -R Convert raw hexadecimal numbers (00E9)\n"));
  fprintf(stderr,
	  _("       -S Convert hexadecimal escapes for the three low bytes in big-endian order (\\x00\\x00\\xE9)\n"));
  fprintf(stderr,
	  _("       -T Convert decimal escapes for the three low bytes in big-endian order (\\d000\\d000\\d233)\n"));
  fprintf(stderr,
	  _("       -U Convert \\u-escaped hex (\\u00E9)\n"));
  fprintf(stderr,
	  _("       -V Convert \\u-escaped decimal (\\u0233)\n"));
  fprintf(stderr,
	  _("       -X Convert standard form hexadecimal numbers (0x00E9)\n"));
  fprintf(stderr,
	  _("       -Y Convert all three HTML escape types:\n\t\thexadecimal numeric, decimal numeric, and character entity.\n"));
  fprintf(stderr,
	  _("       -Z <format> Convert input using the supplied format.\n"));
  fprintf(stderr,_("Report bugs to: billposer@alum.mit.edu\n"));
}

/* Line buffer for input containing embedded escapes; main() fills it with
   fgets(lbuf,LBUFSIZE,stdin), so at most LBUFSIZE-1 characters plus the
   terminating null are ever stored. */
static char lbuf [LBUFSIZE+1];


/* The length of the longest character entity */
/* NOTE(review): MAXENTLEN is not referenced anywhere in the visible part
   of this file - confirm whether it is still needed. */
#define MAXENTLEN 8

/* Largest value that is a legal Unicode code point. */
#define MAXCODEPOINT 0x10FFFFUL

extern void putu8 (unsigned long);
extern char * Get_Word(FILE *, int *, int *);

/*
 * Build the scanf format actually used for matching from the escape
 * format fmt. Normally "%n" is appended so that the number of characters
 * consumed is reported. If OptionalSemicolonP is set and fmt ends in ';',
 * the result is "<fmt without ;>%n;%n": the first count is stored once
 * the value has been read, the second only if the ';' is present, which
 * lets callers recognise escapes lacking the final semicolon.
 * Returns malloc'ed storage owned by the caller, or NULL on failure.
 */
static char *
MakeScanFormat(const char *fmt, int OptionalSemicolonP)
{
  static const char Plain[] = "%n";
  static const char Split[] = "%n;%n";
  size_t len = strlen(fmt);
  char *out = malloc(len + sizeof Split);

  if(out == NULL) return NULL;
  if(OptionalSemicolonP && (len > 0) && (fmt[len-1] == ';')) {
    memcpy(out,fmt,len-1);
    memcpy(out+len-1,Split,sizeof Split);
  }
  else {
    memcpy(out,fmt,len);
    memcpy(out+len,Plain,sizeof Plain);
  }
  return out;
}

/*
 * Match one numeric escape at the start of s. sfmt must contain a single
 * long-sized integer conversion followed by one or two %n directives (see
 * MakeScanFormat). Returns the number of characters consumed and stores
 * the value in *value, or returns 0 if s does not start with a complete
 * escape. *unterminated is set when the second %n was not reached.
 * sscanf() returns EOF when the input ends before the first conversion,
 * and leaves the %n targets untouched when matching stops early, so both
 * the return value and the counts are checked.
 */
static int
ScanNumber(const char *s, const char *sfmt, unsigned long *value, int *unterminated)
{
  unsigned long v = 0UL;
  int n1 = -1;
  int n2 = -1;

  if( (sscanf(s,sfmt,&v,&n1,&n2) < 1) || (n1 <= 0)) return 0;
  *value = v;
  *unterminated = (n2 < 0);
  return (n2 > 0) ? n2 : n1;
}

/*
 * Match one character entity name at the start of s using sfmt (the
 * entity format processed by MakeScanFormat). The name is stored in name,
 * which must be large enough to hold all of s. Return value and
 * *unterminated are as for ScanNumber.
 */
static int
ScanEntity(const char *s, const char *sfmt, char *name, int *unterminated)
{
  int n1 = -1;
  int n2 = -1;

  name[0] = '\0';
  if( (sscanf(s,sfmt,name,&n1,&n2) < 1) || (n1 <= 0)) return 0;
  *unterminated = (n2 < 0);
  return (n2 > 0) ? n2 : n1;
}

/*
 * Match up to three byte escapes at the start of s. bfmt contains three
 * unsigned int conversions, each immediately followed by %n, so the
 * length is known however many bytes matched. The bytes are combined in
 * big-endian order. Returns the number of characters consumed, or 0 if
 * not even one byte matched.
 */
static int
ScanBytes(const char *s, const char *bfmt, unsigned long *value)
{
  unsigned int b[3] = {0U,0U,0U};
  int n[3] = {-1,-1,-1};
  unsigned long v = 0UL;
  int got;
  int i;

  got = sscanf(s,bfmt,&b[0],&n[0],&b[1],&n[1],&b[2],&n[2]);
  if(got < 1) return 0;
  if(got > 3) got = 3;
  if(n[got-1] <= 0) return 0;
  for(i = 0; i < got; i++) v = (v << 8) | (unsigned long)(b[i] & 0xFFU);
  *value = v;
  return n[got-1];
}

/*
 * Match one \u or \U escape at the start of s. Both the tag letter and
 * the value must be present. Returns the number of characters consumed
 * (0 for no match), storing the tag letter in *tag and the value in *value.
 */
static int
ScanSplit(const char *s, const char *sfmt, char *tag, unsigned long *value)
{
  char t[2] = "";
  unsigned long v = 0UL;
  int n = -1;

  if( (sscanf(s,sfmt,t,&v,&n) != 2) || (n <= 0)) return 0;
  *tag = t[0];
  *value = v;
  return n;
}

/*
 * ascii2uni: read text from stdin, replace escapes in the selected
 * format by the characters they denote, and write the result to stdout.
 *
 * With -p ("pure") the input is a sequence of whitespace-separated
 * escapes read with Get_Word(); anything that is not a well-formed escape
 * is a fatal error. Otherwise the input is read line by line and text
 * that does not match the format is copied through unchanged.
 * Unless -q is given, conversion statistics are written to stderr.
 */
int main (int ac, char *av[])
{
  const char *SplitFormat = "\\%1[uU]%lX%n"; /* This is for BMPSplit */

  const char *Afmt = "<U%lX>";
  const char *Bfmt = "\\x%lX";
  const char *Cfmt = "\\x{%lX}";
  const char *Dfmt = "&#%ld;";
  const char *Efmt = "U%lX";
  const char *Ffmt = "u%lX";
  const char *Gfmt = "X\'%lX\'";
  const char *Hfmt = "&#x%lX;";
  const char *Ifmt = "=%2lX";		/* UTF-8 */
  const char *Jfmt = "%%%2lX";		/* UTF-8 */
  const char *Kfmt = "\\%3lo";		/* UTF-8 */
  const char *Mfmt = "\\#x%lX;";
  const char *Nfmt = "\\#%ld;";
  /* The byte formats carry their own %n after every byte - see ScanBytes */
  const char *Ofmt = "\\%3o%n\\%3o%n\\%3o%n";
  const char *Pfmt = "U+%lX";
  const char *Qfmt = "&%[abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789];";
  const char *Rfmt = "%lX";
  const char *Sfmt = "\\x%2x%n\\x%2x%n\\x%2x%n";
  const char *Tfmt = "\\d%3u%n\\d%3u%n\\d%3u%n";
  const char *Ufmt = "\\u%8lX";
  const char *Vfmt = "\\u%8ld";
  const char *Xfmt = "0x%4lX";

  const char *fmt = Xfmt;	/* Default is plain hex format */
  char *afmt = NULL;		/* fmt with %n added */
  char *aHfmt = NULL;		/* Hfmt and Dfmt with %n added, for -Y */
  char *aDfmt = NULL;
  char hexbuf[3];

  UTF32 num;
  unsigned long val;
  int oc;			/* Command line option flag */
  long TokenNumber;
  long ReplacedNumber;
  int BMPSplit = 0;
  int VerboseP = 1;
  int UTF8ValueP = 0;		/* Are incoming values UTF-8? */
  int AllHTMLP = 0;		/* Translate all three kinds of HTML escape */
  int PureP = 0;
  int HTMLP;			/* Is the format one of the HTML formats? */
  int ByteFormatP;		/* Is the format one of the three-byte formats? */
  int Unterminated;		/* Did the escape lack its final semi-colon? */
  int Word_Length;
  int NConsumed;
  int LineNo;
  char *str;
  char *iptr;
  int eof;
  char SplitTag;
  char enam[LBUFSIZE];
  long MicrosoftStyle = 0L;

  extern int optind;
  extern int opterr;
  extern int optopt;

  opterr = 0;

#ifdef HAVE_SETLOCALE
   setlocale(LC_ALL,"");
#endif
#ifdef HAVE_LIBINTL_H
   bindtextdomain (PACKAGE,LOCALEDIR);
   textdomain (PACKAGE);
#endif

  /* Handle command line arguments */

   /* The leading colon makes getopt return ':' for a missing argument */
   while( (oc = getopt(ac,av,":ABCDEFGHIJKLMNOPQRSTUVXYZ:hpqv")) != -1){
     switch(oc){
     case 'A':
       fmt = Afmt;
       break;
     case 'B':
       fmt = Bfmt;
       break;
     case 'C':
       fmt = Cfmt;
       break;
     case 'D':
       fmt = Dfmt;
       break;
     case 'E':
       fmt = Efmt;
       break;
     case 'F':
       fmt = Ffmt;
       break;
     case 'G':
       fmt = Gfmt;
       break;
     case 'H':
       fmt = Hfmt;
       break;
     case 'I':
       fmt = Ifmt;
       UTF8ValueP = 1;
       break;
     case 'J':
       fmt = Jfmt;
       UTF8ValueP = 1;
       break;
     case 'K':
       fmt = Kfmt;
       UTF8ValueP = 1;
       break;
     case 'L':
       fmt = SplitFormat;
       BMPSplit =1;
       break;
     case 'M':
       fmt = Mfmt;
       break;
     case 'N':
       fmt = Nfmt;
       break;
     case 'O':
       fmt = Ofmt;
       break;
     case 'P':
       fmt = Pfmt;
       break;
     case 'Q':
       fmt = Qfmt;
       break;
     case 'R':
       fmt = Rfmt;
       break;
     case 'S':
       fmt = Sfmt;
       break;
     case 'T':
       fmt = Tfmt;
       break;
     case 'U':
       fmt = Ufmt;
       break;
     case 'V':
       fmt = Vfmt;
       break;
     case 'X':
       fmt = Xfmt;
       break;
     case 'Y':
       fmt = Qfmt;
       AllHTMLP = 1;
       break;
     case 'Z':
       /* NOTE(review): the user's string is used as a scanf format. It
	  must contain exactly one long-sized integer conversion; anything
	  else makes sscanf misbehave. */
       fmt = optarg;
       break;
     case 'p':
       PureP = 1;
       break;
     case 'q':
       VerboseP = 0;
       break;
     case 'h':
       ShowUsage();
       exit(INFO);
       break;			/* NOTREACHED */
     case 'v':
       ShowVersion();
       exit(INFO);
       break;			/* NOTREACHED */
     case ':':
       fprintf(stderr,_("%s: missing argument to option flag %c.\n"),pgname,optopt);
       exit(BADOPTIONARG);
     default:
       fprintf(stderr,_("%1$s: invalid option flag %2$c\n"),pgname,optopt);
       ShowVersion();
       ShowUsage();
       exit(INFO);
     }
   }

   if( (fmt == Rfmt) && (!PureP) ) {
     fprintf(stderr,_("It isn't possible to parse raw hex unicode out of ASCII text.\n"));
     exit(BADOPTION);
   }

   if(AllHTMLP && PureP) {
     fprintf(stderr,_("Conversion of all three HTML formats is not supported in pure mode.\n"));
     exit(BADOPTION);
   }

   HTMLP = (fmt == Dfmt) || (fmt == Hfmt) || (fmt == Qfmt);
   ByteFormatP = (fmt == Ofmt) || (fmt == Sfmt) || (fmt == Tfmt);

   /* Add %n for NConsumed. The storage is sized from the format, so a
      long -Z format cannot overflow it. */
   afmt = MakeScanFormat(fmt,HTMLP);
   if(AllHTMLP) {
     aDfmt = MakeScanFormat(Dfmt,1);
     aHfmt = MakeScanFormat(Hfmt,1);
   }
   if( (afmt == NULL) || (AllHTMLP && ((aDfmt == NULL) || (aHfmt == NULL)))) {
     fprintf(stderr,_("%s: out of memory.\n"),pgname);
     exit(OUTOFMEMORY);
   }

   ReplacedNumber = 0L;
   TokenNumber = 0L;
   /*
    * This is the case in which the input consists entirely of escapes
    * except for arbitrary (but non-null) amounts of intervening whitespace.
    */

   if(PureP) {
     while(1){
       str = Get_Word(stdin,&Word_Length,&eof);
       if(eof) break;
       if(Word_Length == 0) continue;
       TokenNumber++;
       if(str == NULL){
	 fprintf(stderr,_("%1$s: failed to allocate storage for input token %2$ld.\n"),
		 pgname,TokenNumber);
	 exit(OUTOFMEMORY);
       }

       val = 0UL;
       NConsumed = 0;
       Unterminated = 0;
       if(fmt == Qfmt) {
	 /* A word too long for enam cannot be an entity name */
	 if(strlen(str) < sizeof(enam)) {
	   NConsumed = ScanEntity(str,afmt,enam,&Unterminated);
	 }
	 if(NConsumed > 0) {
	   num = LookupCodeForEntity(enam);
	   if(!num) {
	     /* An unknown entity is fatal in pure mode: it is reported
		here and then rejected as ill-formed below. */
	     fprintf(stderr,"ascii2uni: unknown HTML character entity \"&%s;\"\n",
		     enam);
	     NConsumed = 0;
	   }
	   val = (unsigned long) num;
	 }
       }
       else if(ByteFormatP) {
	 NConsumed = ScanBytes(str,fmt,&val);
       }
       else if(BMPSplit) {
	 NConsumed = ScanSplit(str,SplitFormat,&SplitTag,&val);
       }
       else {
	 NConsumed = ScanNumber(str,afmt,&val,&Unterminated);
       }

       if(NConsumed < 1) {
	 fprintf(stderr,_("Ill-formed input %1$s at token %2$lu\n"),str,
		 (unsigned long)TokenNumber);
	 exit(BADRECORD);
       }
       else if(val > MAXCODEPOINT) {
	 fprintf(stderr,_("The character encoded as %1$s at token %2$lu is outside the Unicode range.\n\tEmitting Unicode replacement character.\n"),
		 str,(unsigned long)TokenNumber);
	 putu8(UNI_REPLACEMENT_CHAR);
	 ReplacedNumber++;
       }
       else {
	 if (UTF8ValueP) putchar((int)val);
	 else putu8(val);
	 if(HTMLP && Unterminated) MicrosoftStyle++;
       }
       free((void *)str);
     }
     goto done;
   }

   /* This is the case in which the Unicode escapes are embedded in ASCII text */

   LineNo = 0;
   while(fgets(lbuf,LBUFSIZE,stdin) != NULL) {
     LineNo++;
     iptr = lbuf;

     if(fmt == Jfmt) {
       /*
	* Special case for J format: a '%' followed by exactly two hex
	* digits is one byte of UTF-8; everything else, including an
	* incomplete escape, is copied through unchanged. The tests are
	* ordered so that nothing past the terminating null is read.
	*/
       while(*iptr) {
	 if( (iptr[0] == '%') &&
	     isxdigit((unsigned char)iptr[1]) &&
	     isxdigit((unsigned char)iptr[2])) {
	   hexbuf[0] = iptr[1];
	   hexbuf[1] = iptr[2];
	   hexbuf[2] = '\0';
	   putchar((int)strtoul(hexbuf,NULL,16));
	   TokenNumber++;
	   iptr += 3;
	 }
	 else putchar(*iptr++);
       }
       continue;
     } /* End of special case for J format */

     while (*iptr) {
       val = 0UL;
       Unterminated = 0;
       if(BMPSplit) {
	 NConsumed = ScanSplit(iptr,SplitFormat,&SplitTag,&val);
	 if(NConsumed > 0) {
	   if( (val <= 0xFFFFUL) && (SplitTag == 'U')) {
	     fprintf(stderr,_("Warning: the code \\U%1$08lX at line %2$d falls within the BMP.\n"),
		     val,LineNo);
	   }
	   if( (val > 0xFFFFUL) && (SplitTag == 'u')) {
	     fprintf(stderr,_("Warning: the code \\u%1$08lX at line %2$d falls outside the BMP.\n"),
		     val,LineNo);
	   }
	   putu8(val);
	   iptr+=NConsumed;
	   TokenNumber++;
	 }
	 else putchar(*iptr++);
       }
       else if (fmt == Qfmt) {
	 if (AllHTMLP){
	   NConsumed = ScanNumber(iptr,aHfmt,&val,&Unterminated);
	   if(NConsumed < 1) NConsumed = ScanNumber(iptr,aDfmt,&val,&Unterminated);
	   if(NConsumed > 0) {
	     putu8(val);
	     iptr+=NConsumed;
	     if(Unterminated) MicrosoftStyle++;
	     TokenNumber++;
	     continue;
	   }
	 }
	 NConsumed = ScanEntity(iptr,afmt,enam,&Unterminated);
	 if(NConsumed < 1) {
	   putchar(*iptr++);
	   continue;
	 }
	 num = LookupCodeForEntity(enam);
	 if(num) {
	   putu8((unsigned long)num);
	   iptr+=NConsumed;
	   if(Unterminated) MicrosoftStyle++;
	   TokenNumber++;
	 }
	 else if(Unterminated) {
	   /* An unknown name with no semi-colon, as in "AT&T", is taken
	      to be ordinary text rather than a broken entity. */
	   putchar(*iptr++);
	 }
	 else {
	   fprintf(stderr,"ascii2uni: unknown HTML character entity \"&%s;\" at line %d\n",
		   enam,LineNo);
	   putu8(UNI_REPLACEMENT_CHAR);
	   iptr+=NConsumed;
	   ReplacedNumber++;
	 }
       } /* End of Qfmt case */
       else if(ByteFormatP) {
	 NConsumed = ScanBytes(iptr,fmt,&val);
	 if(NConsumed < 1) {
	   putchar(*iptr++);
	   continue;
	 }
	 if(val > MAXCODEPOINT) {
	   /* Copy the offending escape for the message; it is part of the
	      current line and so always fits in enam. */
	   memcpy(enam,iptr,(size_t)NConsumed);
	   enam[NConsumed] = '\0';
	   fprintf(stderr,_("The character encoded as %1$s at token %2$lu is outside the Unicode range.\n\tEmitting Unicode replacement character.\n"),
		   enam,(unsigned long)(TokenNumber+1));
	   putu8(UNI_REPLACEMENT_CHAR);
	   ReplacedNumber++;
	 }
	 else putu8(val);
	 iptr+=NConsumed;
	 TokenNumber++;
       }
       else {			/* Default - not BMPSplit, HTML entity, or byte format */
	 NConsumed = ScanNumber(iptr,afmt,&val,&Unterminated);
	 if(NConsumed > 0) {
	   if (UTF8ValueP) putchar((int)val);
	   else putu8(val);
	   iptr+=NConsumed;
	   if(HTMLP && Unterminated) MicrosoftStyle++;
	   TokenNumber++;
	 }
	 else putchar(*iptr++);
       }
     } /* Loop over current line */
   } /* Loop over input lines */

done:
   free(afmt);
   free(aHfmt);
   free(aDfmt);
   if(VerboseP) {
     fprintf(stderr,_("%ld tokens converted\n"),TokenNumber);
     fprintf(stderr,_("%ld tokens replaced with Unicode Replacement Character\n"),ReplacedNumber);
     if(MicrosoftStyle) {
       fprintf(stderr,
	       _("%ld Microsoft-style (lacking final semi-colon)\n"),MicrosoftStyle);
     }
   }
   exit(SUCCESS);
}


Generated by  Doxygen 1.6.0   Back to index