#include <errno.h>
#include <iconv.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* NOTE(review): this file originally carried six #include directives whose
 * header names were lost. The five above cover every library call made in
 * this file; confirm whether a sixth header is still needed. */

#define MAX 1024

enum {
	CS_NAME_SIZE   = 32,            /* room for a charset name plus NUL */
	ENCODING_COUNT = 0x20,          /* first bytes 0x00..0x1F select a charset */
	BUF_SIZE       = MAX * 6,       /* UTF-8 needs up to 6 bytes */
	/* Worst case every converted byte becomes a 6-byte entity ("&quot;"),
	 * plus the terminating NUL, so escaping can never overrun result[]. */
	RESULT_SIZE    = BUF_SIZE * 6 + 1
};

static char buf[BUF_SIZE];
static char result[RESULT_SIZE];

/* The spec says ISO-6937, but many stations get it wrong and use ISO-8859-1. */
char *iso6937_encoding = "ISO6937";

/*
 * Charset selection handlers.
 *
 *   t  receives the NUL-terminated iconv charset name (CS_NAME_SIZE bytes)
 *   s  points at the caller's text pointer; a handler advances it past any
 *      charset selector bytes it consumes
 *   d  per-entry data taken from the encoding[] table
 *
 * Each handler returns 0 on success and non-zero if the text cannot be
 * converted.
 */

/* No selector byte: use the configurable default charset, consume nothing. */
static int encoding_default(char *t, const char **s, const char *d)
{
	(void)s;
	(void)d;
	snprintf(t, CS_NAME_SIZE, "%s", iso6937_encoding);
	return 0;
}

/* One selector byte that maps to the fixed charset name in d. */
static int encoding_fixed(char *t, const char **s, const char *d)
{
	snprintf(t, CS_NAME_SIZE, "%s", d);
	*s += 1;
	return 0;
}

/*
 * Selector byte followed by a 16-bit big-endian number that is formatted
 * into the charset name with the printf pattern in d.
 *
 * The two number bytes may legitimately be 0x00, so the text cannot be
 * length-checked with strlen() here; the caller must supply at least three
 * bytes.
 */
static int encoding_variable(char *t, const char **s, const char *d)
{
	const unsigned char *p = (const unsigned char *)*s;
	int i = (p[1] << 8) + p[2];

	snprintf(t, CS_NAME_SIZE, d, i);
	*s += 3;
	return 0;
}

/* Selector byte with no assigned meaning: report it and refuse the text. */
static int encoding_reserved(char *t, const char **s, const char *d)
{
	(void)t;
	(void)d;
	fprintf(stderr, "Reserved encoding: %02x\n", (unsigned char)(*s)[0]);
	return 1;
}

struct encoding {
	int (*handler)(char *t, const char **s, const char *d);
	const char *data;
};

/* Charset selected by a first byte in the range 0x00..0x1F. */
static const struct encoding encoding[ENCODING_COUNT] = {
	[0x00] = {encoding_reserved, NULL},
	[0x01] = {encoding_fixed, "ISO-8859-5"},
	[0x02] = {encoding_fixed, "ISO-8859-6"},
	[0x03] = {encoding_fixed, "ISO-8859-7"},
	[0x04] = {encoding_fixed, "ISO-8859-8"},
	[0x05] = {encoding_fixed, "ISO-8859-9"},
	[0x06] = {encoding_fixed, "ISO-8859-10"},
	[0x07] = {encoding_fixed, "ISO-8859-11"},
	[0x08] = {encoding_fixed, "ISO-8859-12"},
	[0x09] = {encoding_fixed, "ISO-8859-13"},
	[0x0A] = {encoding_fixed, "ISO-8859-14"},
	[0x0B] = {encoding_fixed, "ISO-8859-15"},
	[0x0C] = {encoding_reserved, NULL},
	[0x0D] = {encoding_reserved, NULL},
	[0x0E] = {encoding_reserved, NULL},
	[0x0F] = {encoding_reserved, NULL},
	[0x10] = {encoding_variable, "ISO-8859-%d"},
	[0x11] = {encoding_fixed, "ISO-10646/UCS2"}, // FIXME: UCS-2 LE/BE ???
	[0x12] = {encoding_fixed, "KSC_5601"},       // TODO needs newer iconv
	[0x13] = {encoding_fixed, "GB_2312-80"},
	[0x14] = {encoding_fixed, "BIG5"},
	[0x15] = {encoding_fixed, "ISO-10646/UTF8"},
	[0x16] = {encoding_reserved, NULL},
	[0x17] = {encoding_reserved, NULL},
	[0x18] = {encoding_reserved, NULL},
	[0x19] = {encoding_reserved, NULL},
	[0x1A] = {encoding_reserved, NULL},
	[0x1B] = {encoding_reserved, NULL},
	[0x1C] = {encoding_reserved, NULL},
	[0x1D] = {encoding_reserved, NULL},
	[0x1E] = {encoding_reserved, NULL},
	[0x1F] = {encoding_reserved, NULL},
};

/* Any first byte from 0x20 upwards is ordinary text in the default charset. */
static const struct encoding encoding_fallback = {encoding_default, NULL};

static char cs_old[CS_NAME_SIZE]; /* charset the cached descriptor was opened for */
static iconv_t cd;                /* cached conversion descriptor */
static int cd_open;               /* non-zero while cd holds an open descriptor */

/* Control characters that xmlify() reports on stderr (and still copies). */
static int is_forbidden_char(unsigned char c)
{
	return c <= 0x08 || (c >= 0x0B && c <= 0x1F) || c == 0x7F;
}

/*
 * Quote the xml entities in the string passed in.
 *
 * The charset is chosen from the first byte of s (see encoding[]), the text
 * is converted to UTF-8 and '&', '<' and '>' are replaced by their entities.
 *
 * Returns a pointer to a static buffer that is overwritten by the next call,
 * or "" when s is NULL, empty, or starts with a reserved selector byte. At
 * most BUF_SIZE bytes of UTF-8 are produced; longer text is truncated and a
 * conversion error is reported on stderr. The process exits if iconv does
 * not know the selected charset.
 */
char *xmlify(const char *s)
{
	char cs_new[CS_NAME_SIZE];

	/* An empty string carries no selector byte; there is nothing to do. */
	if (s == NULL || s[0] == '\0')
		return "";

	unsigned char first = (unsigned char)s[0];
	const struct encoding *enc =
		first < ENCODING_COUNT ? &encoding[first] : &encoding_fallback;
	if (enc->handler(cs_new, &s, enc->data))
		return "";

	/* Re-open the descriptor only when the charset actually changes. */
	if (!cd_open || strcmp(cs_old, cs_new) != 0) {
		if (cd_open) {
			iconv_close(cd);
			cd_open = 0;
		}
		cd = iconv_open("UTF-8", cs_new);
		if (cd == (iconv_t)-1) {
			fprintf(stderr, "iconv_open() failed: %s\n", strerror(errno));
			exit(1);
		}
		cd_open = 1;
		snprintf(cs_old, sizeof cs_old, "%s", cs_new);
	}

	/* Start from the initial shift state in case an earlier call failed. */
	iconv(cd, NULL, NULL, NULL, NULL);

	char *inbuf = (char *)s;
	size_t inbytesleft = strlen(s);
	char *outbuf = buf;
	size_t outbytesleft = sizeof buf;
	if (iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft) == (size_t)-1) {
		/* Keep whatever was converted before the failure. */
		fprintf(stderr, "iconv() failed: %s\n", strerror(errno));
	}

	// Luckily '&<> are single byte character sequences in UTF-8 and no
	// other character will have a UTF-8 sequence containing these
	// patterns. Because the MSB is set in all multi-byte sequences, we can
	// simply scan for '&<> and don't have to parse UTF-8 sequences.
	// ('"' would additionally need "&quot;" if the result were used in an
	// attribute value.)
	char *r = result;
	for (const char *b = buf; b < outbuf; b++) {
		const char *entity = NULL;

		switch (*b) {
		case '&':
			entity = "&amp;";
			break;
		case '<':
			entity = "&lt;";
			break;
		case '>':
			entity = "&gt;";
			break;
		default:
			break;
		}

		if (entity != NULL) {
			size_t n = strlen(entity);
			memcpy(r, entity, n);
			r += n;
			continue;
		}

		/* Forbidden characters are reported but still copied through. */
		if (is_forbidden_char((unsigned char)*b))
			fprintf(stderr, "Forbidden char %02x\n", (unsigned char)*b);
		*r++ = *b;
	}
	*r = '\0';
	return result;
} // xmlify

#ifdef MAIN
int main(int argc, char **argv)
{
	if (argc > 1)
		printf("%s\n%s\n", argv[1], xmlify(argv[1]));
	return 0;
} // main
#endif