/* uni2ascii.c -- source listing from the uni2ascii package
 * (text extracted from a Doxygen-generated source browser page).
 */

/* Time-stamp: <2005-12-06 17:19:47 poser>
 *
 * Converts UTF-8 Unicode to pure 7-bit ASCII using any of several
 * different representations. 
 * 
 * Copyright (C) 2004, 2005 William J. Poser (billposer@alum.mit.edu)
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 * or go to the web page:  http://www.gnu.org/licenses/gpl.txt.
 */

#include "config.h"
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#ifdef HAVE_LOCALE_H
#include <locale.h>
#endif
#ifdef HAVE_LIBINTL_H
#include <libintl.h>
#define _(String) gettext(String)
#else
#define _(x) (x)
#endif
#include "unicode.h"
#include "enttbl.h"
#include "utf8error.h"
#include "exitcode.h"
/*
 * Build stamp printed by ShowVersion().  When the compiler does not
 * provide both __DATE__ and __TIME__ the stamp is the empty string and
 * HAVE_DATE_TIME stays undefined, so ShowVersion() skips it.
 */
#if defined(__DATE__) && defined(__TIME__)
#define HAVE_DATE_TIME
char compdate[]= "Compiled " __DATE__ " " __TIME__ ;
#else
char compdate[]= "";
#endif
/* Size in bytes of the msg buffer declared below. */
#define MSGSIZE 128

/* PACKAGE_VERSION is not defined in this file -- presumably it comes from config.h. */
char version[]=PACKAGE_VERSION;
/* Program name used in the version banner, the usage text and option errors. */
char pgname[]="uni2ascii";

/* Scratch buffer: main() formats its read-error message here before calling perror(). */
char msg [MSGSIZE];

/*
 * Write the version banner to stderr: a blank line, the program name and
 * version, the build stamp (only when HAVE_DATE_TIME is defined), the
 * copyright line and the licence notice followed by a blank line.
 */
void
ShowVersion(void){
  FILE *out = stderr;

  fprintf(out,"\n%s  %s\n",pgname,version);
#ifdef HAVE_DATE_TIME
  fputs(compdate,out);
  fputc('\n',out);
#endif
  /* The copyright line is deliberately not passed through _(). */
  fputs("Copyright (C) 2004, 2005 William J. Poser\n",out);
  fprintf(out,_("Released under the terms of the GNU General Public License.\n\n"));
}

/*
 * Write a short description of the program and a one-line summary of
 * every command line flag to stderr.
 *
 * Every message is looked up through _() and the result is used as the
 * printf format string.  Two consequences for the text below:
 *   - a literal per cent sign has to be written as %%;
 *   - a literal backslash has to be written as two backslashes, otherwise
 *     the compiler treats it as the start of an escape sequence.
 */
void
ShowUsage(void){
  fprintf(stderr,_("This program is a filter which converts UTF-8 Unicode\n\
to any of a variety 7-bit ASCII textual representations.\n"));
  fprintf(stderr,_("By default all characters above 0x7F are converted to the specified\n\
format except for newline and the space characters (space, tab\n\
ideographic space, ethiopic word space and ogham space mark).\n\
Options allow conversion of the ASCII characters as well\n\
and for conversion even of newline and space characters.\n"));
  fprintf(stderr,_("Usage: %s [flags]\n"),pgname);
  fprintf(stderr,_("       -h Print this usage message.\n"));
  fprintf(stderr,_("       -v Print version information.\n"));
  fprintf(stderr,_("       -l Use lower-case a-f when generating hex.\n"));
  fprintf(stderr,_("       -n Convert newlines.\n"));
  fprintf(stderr,_("       -p Pure. Convert codepoints below 0x80 too.\n"));
  fprintf(stderr,_("       -q Quiet. Do not chat unnecessarily while working.\n"));
  fprintf(stderr,_("       -s Convert space characters.\n"));
  fprintf(stderr,_("       -w Add a space after each converted item.\n"));
  fprintf(stderr,
        _("       -A Generate hexadecimal numbers with prefix U in angle-brackets(<U00E9>)\n"));
  fprintf(stderr,
        _("       -B Generate backslash-x escaped hexadecimal numbers (\\x00E9)\n"));
  fprintf(stderr,
        _("       -C Generate backslash-x escaped hexadecimal numbers in braces (\\x{00E9})\n"));
  fprintf(stderr,
        _("       -D Generate decimal numeric character references (&#0233;)\n"));
  fprintf(stderr,
        _("       -E Generate hexadecimal with prefix U (U00E9)\n"));
  fprintf(stderr,
        _("       -F Generate hexadecimal with prefix u (u00E9)\n"));
  fprintf(stderr,
        _("       -G Generate hexadecimal in single quotes with prefix X (X\'00E9\')\n"));
  fprintf(stderr,
        _("       -H Generate hexadecimal numeric character references (&#x00E9;)\n"));
  fprintf(stderr,
        _("       -I Generate hexadecimal UTF-8 with each byte's hex preceded by an =-sign (=C3=A9)\n"));
  fprintf(stderr,
        _("       -J Generate hexadecimal UTF-8 with each byte's hex preceded by a %%-sign  (%%C3%%A9)\n"));
  /* The backslashes are doubled so the help shows the seven-bit text
     "\303\251"; written singly they are octal escapes and the two raw
     bytes 0xC3 0xA9 would be printed instead of the example. */
  fprintf(stderr,
        _("       -K Generate octal UTF-8 with backslash escapes (\\303\\251)\n"));
  fprintf(stderr,
        _("       -L Generate \\U-escaped hex outside the BMP, \\u-escaped hex within.\n"));
  fprintf(stderr,
        _("       -P Generate hexadecimal numbers with prefix U+ (U+00E9)\n"));
  fprintf(stderr,
        _("       -Q Generate character entities where possible (&eacute;)\n"));
  fprintf(stderr,
        _("       -R Generate raw hexadecimal numbers (00E9)\n"));
  fprintf(stderr,
        _("       -U Generate \\u-escaped hex (\\u00E9)\n"));
  fprintf(stderr,
        _("       -X Generate standard form hexadecimal numbers (0x00E9)\n"));
  fprintf(stderr,
        _("       -Z <format> Use the specified format\n"));
  fprintf(stderr,_("Report bugs to: billposer@alum.mit.edu\n"));
}

/*
 * Output format selectors.
 *
 * A selector picks a pair of adjacent rows in Formats[]: row 2*selector
 * is the variant that prints hex digits in lower case and row
 * 2*selector+1 the variant that prints them in upper case.  Selectors
 * that have rows in the table (HTMLX through XQ) must therefore be
 * numbered consecutively from 0 in the same order as the table rows.
 */
#define HTMLX  0
#define BSLU   1
#define BSLX   2
#define STDX   3
#define RAWX   4
#define BSLXB  5
#define HTMLD  6
#define ABUX   7
#define JUUX   8
#define JuUX   9
#define UPLX   10
#define XQ     11   /* must equal the position of the X'....' pair in Formats[] */
/*
 * The remaining selectors have no rows in Formats[] and must never be
 * used to index it; they are numbered after the last table-backed one.
 */
#define URLPX8 12
#define URLEX8 13
#define BSEO8  14

/* printf formats, two per selector: lower-case hex first, upper-case second. */
static char *Formats [] = {
    "&#x%04x;",  /* HTMLX: HTML hex */
    "&#x%04X;",
    "\\u%04x",   /* BSLU: Python */
    "\\u%04X",
    "\\x%04x",    /* BSLX: Tcl hex number as opposed to character code  */
    "\\x%04X",
    "0x%04x",    /* STDX: Standard hex number notation */
    "0x%04X",
    "%04x",       /* RAWX: Raw hex */
    "%04X",
    "\\x{%04x}",  /* BSLXB */
    "\\x{%04X}",
    "&#%04d;",    /* HTMLD: HTML decimal */
    "&#%04d;",    /* HTML decimal - entering this twice is not an error. */
    "<U%04x>",    /* ABUX */
    "<U%04X>",
    "U%04x",      /* JUUX */
    "U%04X",
    "u%04x",      /* JuUX */
    "u%04X",
    "U+%04x",     /* UPLX */
    "U+%04X",
    "X\'%04x\'",  /* XQ */
    "X\'%04X\'"
  };


int main (int ac, char *av[])
{

  UTF32 c;
  int ch;
  int oc;               /* Command line option flag */
  int UCBytes;
  int infd;
  int FType;
  int UTF8Type = 0;
  unsigned long ByteCnt;
  unsigned long CharCnt;

  short PureP = 0;
  short PreserveNewlinesP = 1;
  short PreserveSpacesP = 1;
  short VerboseP = 1;
  short BMPSplitP =0;
  short HexUpperP = 1;  /* Use X or x in  hex ?  */
  short AddWhitespaceP = 0;
  short UseEntitiesP = 0;

  char *AboveBMPfmt;
  char *WithinBMPfmt;
  char *fmt = NULL;
  char *e;

  extern UTF32 Get_UTF32_From_UTF8 (int,int *);
  extern int optind;
  extern int opterr;
  extern int optopt;

  opterr = 0;
  ByteCnt = 0L;
  CharCnt = 0L;
  FType = STDX;               /* Default output is standard hex */

#ifdef HAVE_SETLOCALE
   setlocale(LC_ALL,"");
#endif
#ifdef HAVE_LIBINTL_H
   bindtextdomain (PACKAGE,LOCALEDIR);
   textdomain (PACKAGE);
#endif

  /* Handle command line arguments */

  while( (oc = getopt(ac,av,"lnpqsABCDEFHIJKLPQRUXhvVwZ:")) != EOF){
    switch(oc){
    case 'l':
      HexUpperP = 0;
      break;
    case 'n':
      PreserveNewlinesP = 0;
      break;
    case 'p':
      PureP =1;
      break;
    case 's':
      PreserveSpacesP = 0;
      break;
    case 'w':
      AddWhitespaceP = 1;
      break;
    case 'A':
      FType = ABUX;
      break;
    case 'B':
      FType = BSLX;
      break;
    case 'C':
      FType = BSLXB;
      break;
    case 'D':
      FType = HTMLD;
      break;
    case 'E':
      FType = JUUX;
      break;
    case 'F':
      FType = JuUX;
      break;
    case 'G':
      FType = XQ;
      break;
    case 'H':
      FType = HTMLX;
      break;
    case 'I':
      UTF8Type = 2;
      break;
    case 'J':
      UTF8Type = 1;
      break;
    case 'K':
      UTF8Type = 3;
      break;
    case 'L':
      FType = BSLU;
      BMPSplitP = 1;
      break;
    case 'P':
      FType = UPLX;
      break;
    case 'Q':
      UseEntitiesP = 1;
      break;
    case 'R':
      FType = RAWX;
      break;
    case 'U':
      FType = BSLU;
      BMPSplitP = 0;
      break;
    case 'X':
      FType = STDX;
      break;
    case 'Z':
      fmt = optarg;
      break;
    case 'h':
      ShowUsage();
      exit(INFO);
      break;                  /* NOTREACHED */
    case 'q':
      VerboseP = 0;
      break;
    case 'v':
      ShowVersion();
      exit(INFO);
      break;                  /* NOTREACHED */
    default:
      fprintf(stderr,_("%1$s: invalid option flag %2$c\n"),pgname,optopt);
      ShowVersion();
      ShowUsage();
      exit(BADOPTION);
    }
  } 
  infd = fileno(stdin);

  if(UseEntitiesP && !(FType == HTMLX || FType == HTMLD)) {
    fprintf(stderr,"Use of character entities must be combined with either\ndecimal or hexadecimal character references.\n");
    exit(BADOPTION);
  }

  if(fmt == NULL) {
    fmt = Formats[(2 * FType) + HexUpperP];
  }

  if(HexUpperP) {
    AboveBMPfmt  = "\\U%08X"; 
    WithinBMPfmt = "\\u%04X";  
  }
  else {
    AboveBMPfmt  = "\\U%08x";
    WithinBMPfmt = "\\u%04x";
  }

  if(UTF8Type) {
    while( (ch = getchar()) != EOF) { 
      if(!PureP && (ch <= 0x7F)) putchar(ch);
      else {
      switch (UTF8Type) {
      case 1:
        printf("%%%02X",ch);
        break;
      case 2:
        printf("=%02X",ch);
        break;
      case 3:
        printf("\\%03o",ch);
        break;
      }
      }
      if(AddWhitespaceP) putchar(' ');
    }
    exit(SUCCESS);
  }

  while ( (c = Get_UTF32_From_UTF8(infd,&UCBytes)) <= UNI_MAX_UTF32){
    ByteCnt+=UCBytes;
    CharCnt++;
    switch (c) {
    case 0x000A:
      if(PreserveNewlinesP) putchar((int)c);
      else printf(fmt,c);
      break;
    case 0x0020:        /* space */
    case 0x0009:        /* tab */
      if(PreserveSpacesP)  putchar((int)c);
      else printf(fmt,c);
      break;
    case 0x1361:        /* ethiopic word space */
    case 0x1680:        /* ogham space */
    case 0x3000:        /* ideographic space */
      if(PreserveSpacesP) putchar(0x0020);
      else printf(fmt,c);
      break;
    default:
      if(!PureP && (c <= 0x7F)) putchar((int)c);
      else {
      if(BMPSplitP) {
        if(c > 0xFFFF) printf(AboveBMPfmt,c);
        else printf(WithinBMPfmt,c);
      }
      else {
        if (UseEntitiesP) {
          if ( (e=LookupEntityForCode(c)) != NULL) printf("&%s;",e);
          else printf(fmt,c);
        }
        else printf(fmt,c);
      }
      if(AddWhitespaceP) putchar(' ');
      }
    }
  }

  switch (c){ 
  case UTF8_NOTENOUGHBYTES:
    fprintf(stderr,_("Truncated UTF-8 sequence encountered at byte %1$lu, character %2$lu.\n"),
          ByteCnt,CharCnt);
    exit(BADRECORD);
    break;
  case UTF8_BADINCODE:
    fprintf(stderr,_("Invalid UTF-8 code encountered at byte %1$lu, character %2$lu.\n"),
          ByteCnt,CharCnt);
    exit(BADRECORD);
    break;
  case UTF8_BADOUTCODE:
    fprintf(stderr,_("Encountered invalid Unicode at byte %1$lu, character %2$lu.\n"),
          ByteCnt,CharCnt);
    exit(BADRECORD);
    break;
  case UTF8_IOERROR:
    snprintf(msg,MSGSIZE-1,_ ("Error reading input at byte %1$lu, character %2$lu.\n"),
           ByteCnt,CharCnt);
    perror(msg);
    exit(IOERROR);
    break;
  default:              /* Normal EOF */
    break;
  }

  if(VerboseP) fprintf(stderr,_("%1$ld tokens consisting of %2$ld bytes converted\n"),CharCnt,ByteCnt);
  exit(SUCCESS);
}


/* Generated by Doxygen 1.6.0 */