mirror of /home/gitosis/repositories/libowfat.git
Browse Source
add some single char escaping routines to fmt.h; pull in html5 entities from w3c and use those to do a proper scan_html decoding; fix an off-by-one in fmt_to_array; add a ton of unit tests for the fmt routines
master

26 changed files with 2759 additions and 24 deletions
@ -0,0 +1,228 @@
|
||||
#include <stdio.h> |
||||
#include <ctype.h> |
||||
#include <string.h> |
||||
#include <stdlib.h> |
||||
#include "scan.h" |
||||
#include <assert.h> |
||||
|
||||
#include "scan/scan_ulong.c" |
||||
#include "scan/scan_ulongn.c" |
||||
#include "fmt/fmt_utf8.c" |
||||
#include "fmt/fmt_escapecharc.c" |
||||
|
||||
/* Scratch buffer that receives the UTF-8 encoding of one code point in main(). */
char tmp[20];
/* Second scratch buffer; not referenced by any code that is currently compiled. */
char tmp2[20];

/* Scratch lengths used while parsing the code point list of an entity. */
size_t n;
size_t m;
/* Most recently parsed code point. */
unsigned long l;
||||
|
||||
/* One parsed entity: its name and the UTF-8 bytes it expands to. */
struct entity {
  const char* entity;   /* heap copy of the name, without the leading '&' and trailing ';' */
  char utf8[10];        /* UTF-8 encoding of the entity's code point(s) */
  struct entity* next;  /* next list element, 0 at the end */
};

/* Head of the entity list. */
struct entity* root;
/* Slot that receives the next allocated entity; starts out at the list head. */
struct entity** cur=&root;
||||
|
||||
struct letters;   /* a trie node; each letter links to the next one */

/* One slot of a trie node, i.e. one outgoing edge. */
struct letter {
  char c;                   /* edge label; a slot is in use when this equals its own index */
  struct letters* weiter;   /* next trie level; for the '\0' edge addword stores the payload pointer here */
  uint32_t marshaled;       /* lower 8 bits: char. rest: ofs from start of marshaled blob */
};
||||
|
||||
struct letters { |
||||
size_t n; |
||||
struct letter liste[256]; |
||||
}; |
||||
|
||||
/* Root of the trie that addword() builds. */
struct letters* d;

/* Number of trie edges created so far (bumped by addword for every new slot). */
size_t nodes;
/* Sum of strlen(payload)+1 over all payloads registered via addword. */
size_t datasize;
||||
|
||||
void addword(struct letters** s,const char* t, void* pointer) { |
||||
size_t i; |
||||
if (!*s) { |
||||
*s=malloc(sizeof(**s)); |
||||
memset(*s,0,sizeof(**s)); |
||||
(*s)->liste[0].c='?'; |
||||
} |
||||
i=(unsigned char)*t; |
||||
if ((*s)->liste[i].c==*t) { |
||||
if (!*t) { |
||||
datasize+=strlen((char*)pointer)+1; |
||||
(*s)->liste[i].weiter=pointer; |
||||
} else |
||||
addword(&(*s)->liste[i].weiter,t+1,pointer); |
||||
return; |
||||
} |
||||
|
||||
++nodes; |
||||
(*s)->n++; |
||||
(*s)->liste[i].c=*t; |
||||
if (!*t) { |
||||
datasize+=strlen((char*)pointer)+1; |
||||
(*s)->liste[i].weiter=pointer; |
||||
} else { |
||||
(*s)->liste[i].weiter=0; |
||||
addword(&(*s)->liste[i].weiter,t+1,pointer); |
||||
} |
||||
} |
||||
|
||||
void dump(struct letters* s,size_t depth) { |
||||
size_t i,j; |
||||
if (!s) return; |
||||
for (i=0; i<256; ++i) { |
||||
if (s->liste[i].c!=i) continue; |
||||
for (j=0; j<depth; ++j) printf(" "); |
||||
printf("'%c' -> {\n",s->liste[i].c); |
||||
if (s->liste[i].c) |
||||
dump(s->liste[i].weiter,depth+1); |
||||
for (j=0; j<depth; ++j) printf(" "); |
||||
printf("}\n"); |
||||
} |
||||
} |
||||
|
||||
/* Number of table entries reserved so far by marshalhelper(). */
size_t used;
/* Number of payload bytes copied into the data area so far. */
size_t useddata;

/* The marshaled blob as allocated by marshal(): table first, payload data after it. */
char* heap;
/* The table part of the blob; entry 0 holds the table size in entries. */
uint32_t* marshaled;
/* The payload part of the blob, located directly behind the table. */
char* data;
||||
|
||||
size_t marshalhelper(struct letters* s) { |
||||
size_t i; |
||||
uint32_t myindex=used; |
||||
if (!s) return 0; |
||||
used+=s->n; |
||||
assert(used<nodes+2); |
||||
for (i=1; i!=0; ++i) { // start at 1, go to 256, then access modulo 256; effect: sort but put 0 last
|
||||
uint32_t x; |
||||
i&=0xff; |
||||
// printf("%c ",i);
|
||||
if (s->liste[i].c!=i) { |
||||
if (i==0) return; |
||||
continue; |
||||
} |
||||
// printf("marshalhelper: %c\n",i);
|
||||
x=(unsigned char)s->liste[i].c; |
||||
if (!x) { |
||||
size_t l=strlen((char*)s->liste[i].weiter)+1; |
||||
// puts((char*)s->liste[i].weiter);
|
||||
x|=useddata<<8; |
||||
assert(useddata+l<=datasize); |
||||
memcpy(data+useddata,s->liste[i].weiter,l); |
||||
useddata+=l; |
||||
marshaled[++myindex]=x; |
||||
return; |
||||
} else { |
||||
x|=(used+1)<<8; |
||||
marshalhelper(s->liste[i].weiter); |
||||
} |
||||
marshaled[++myindex]=x; |
||||
} |
||||
// printf("return\n");
|
||||
} |
||||
|
||||
void marshal(struct letters* s) { |
||||
fprintf(stderr,"nodes=%lu, datasize=%lu\n",nodes,datasize); |
||||
heap=malloc((nodes+1)*sizeof(uint32_t)+datasize); |
||||
if (!heap) return; |
||||
marshaled=(uint32_t*)heap; |
||||
marshaled[0]=nodes+1; |
||||
data=heap+(nodes+1)*sizeof(uint32_t); |
||||
marshalhelper(s); |
||||
fprintf(stderr,"actually used: %lu nodes, %lu bytes data\n",used,useddata); |
||||
} |
||||
|
||||
/*
 * Look up the word t in a marshaled blob.
 *
 * ds   start of the blob: a uint32_t table (entry 0 = number of table
 *      entries) directly followed by the payload data
 * ofs  table index at which to start scanning
 * t    NUL-terminated word to find
 *
 * Returns a pointer to the payload inside the blob, or a null pointer if
 * the word is not found.  The scan of one node's entries stops only at an
 * entry labelled '\0' or at the end of the table.
 */
char* lookup(char* ds,size_t ofs,const char* t) {
  uint32_t* table=(uint32_t*)ds;
  uint32_t limit=table[0];
  if (ofs>limit) return 0;
  while (ofs<limit) {
    uint32_t entry=table[ofs];
    unsigned char label=entry&0xff;
    if (label!=(unsigned char)*t) {
      if (!label) return NULL;    // reached the terminator entry without a match
      ++ofs;
      continue;
    }
    if (!label)                   // matched the end of the word: payload found
      return ds+limit*sizeof(uint32_t)+(entry>>8);
    ofs=entry>>8;                 // matched a letter: continue in the child node
    ++t;
  }
  return NULL;
}
||||
|
||||
int main() { |
||||
FILE* f=fopen("entities.json","r"); |
||||
char buf[256]; |
||||
if (!f) return 1; |
||||
#if 0 |
||||
puts("struct { const char* entity; const char* utf8; } codepoints[] = {"); |
||||
#endif |
||||
while (fgets(buf,sizeof(buf),f)) { |
||||
char* s,* entity; |
||||
size_t ul; |
||||
if (!isspace(buf[0])) continue; |
||||
for (s=buf; *s && *s!='"'; ++s) ; // skip whitespace
|
||||
if (!*s=='"') continue; |
||||
++s; |
||||
entity=s; |
||||
if (*entity!='&') continue; ++entity; ++s; |
||||
for (; *s && *s!='"'; ++s) ; // skip to end of entity
|
||||
if (!*s=='"') continue; |
||||
if (s[-1]!=';') continue; |
||||
s[-1]=0; ++s; |
||||
s=strchr(s,'['); |
||||
if (!s) continue; |
||||
n=0; |
||||
#if 0 |
||||
printf(" { \"%s\", \"",entity); |
||||
#endif |
||||
++s; |
||||
*cur=malloc(sizeof(**cur)); |
||||
(*cur)->next=0; |
||||
if (!((*cur)->entity=strdup(entity))) return 1; |
||||
ul=0; |
||||
do { |
||||
while (isspace(*s)) ++s; |
||||
m=scan_ulong(s,&l); |
||||
if (!m) return 2; |
||||
s+=n; |
||||
n=fmt_utf8(tmp,l); |
||||
if (ul+n>sizeof((*cur)->utf8)) return 3; |
||||
memcpy((*cur)->utf8+ul,tmp,n); |
||||
ul+=n; |
||||
#if 0 |
||||
{ |
||||
size_t i; |
||||
for (i=0; i<n; ++i) { |
||||
fwrite(tmp2,fmt_escapecharc(tmp2,(unsigned char)tmp[i]),1,stdout); |
||||
} |
||||
} |
||||
#endif |
||||
if (*s==']') break; |
||||
} while (*s==','); |
||||
#if 0 |
||||
puts("\" },"); |
||||
#endif |
||||
addword(&d,(*cur)->entity,(*cur)->utf8); |
||||
} |
||||
fclose(f); |
||||
// dump(d,0);
|
||||
marshal(d); |
||||
{ |
||||
FILE* f=fopen("entities.h","w"); |
||||
size_t i; |
||||
fprintf(f,"struct {\n uint32_t tab[%lu];\n char data[%lu];\n} entities = {\n {",marshaled[0],datasize); |
||||
for (i=0; i<marshaled[0]; ++i) { |
||||
if (i%8 == 0) fprintf(f,"\n "); |
||||
fprintf(f,"0x%x,",marshaled[i]); |
||||
} |
||||
fprintf(f,"\n } , {"); |
||||
for (i=0; i<datasize; ++i) { |
||||
if (i%16 == 0) fprintf(f,"\n "); |
||||
fprintf(f,"0x%x,",data[i]&0xff); |
||||
} |
||||
fprintf(f,"\n }\n};"); |
||||
fclose(f); |
||||
} |
||||
// puts(lookup(heap,1,"zwnj"));
|
||||
#if 0 |
||||
puts("};"); |
||||
#endif |
||||
return 0; |
||||
} |
@ -0,0 +1,36 @@
|
||||
#include "fmt.h" |
||||
|
||||
/* Write w as exactly three octal digits to dest[0..2]; no NUL is appended. */
static void fmt_oct3(char* dest,uint8_t w) {
  dest[0]=((w>>6)&7)+'0';
  dest[1]=((w>>3)&7)+'0';
  dest[2]=(w&7)+'0';
}
||||
|
||||
/*
 * Write the C escape sequence for the byte ch to dest.
 *
 * Bytes that have a one-letter escape (\a \b \e \f \n \r \t \v \\) are
 * written as backslash plus that letter (2 bytes); every other byte is
 * written as backslash plus three octal digits (4 bytes).
 *
 * dest may be null, in which case nothing is written.  Returns the length
 * of the escape sequence, or 0 if ch does not fit into one byte.
 */
size_t fmt_escapecharc(char* dest,uint32_t ch) {
  char letter=0;
  if (ch>0xff) return 0;
  switch (ch) {
  case '\a': letter='a'; break;
  case '\b': letter='b'; break;
  case 0x1b: letter='e'; break;
  case '\f': letter='f'; break;
  case '\n': letter='n'; break;
  case '\r': letter='r'; break;
  case '\t': letter='t'; break;
  case '\v': letter='v'; break;
  case '\\': letter='\\'; break;
  default: break;
  }
  if (letter) {
    if (dest) {
      dest[0]='\\';
      dest[1]=letter;
    }
    return 2;
  }
  if (dest) {
    dest[0]='\\';
    fmt_oct3(dest+1,ch&0xff);
  }
  return 4;
}
@ -0,0 +1,9 @@
|
||||
#ifndef __GNUC__ |
||||
|
||||
#include "fmt.h" |
||||
|
||||
/* Fallback for compilers without alias support: forwards to fmt_escapecharxml. */
size_t fmt_escapecharhtml(char* dest,uint32_t ch) {
  size_t len=fmt_escapecharxml(dest,ch);
  return len;
}
||||
|
||||
#endif |
@ -0,0 +1,47 @@
|
||||
#include "fmt.h" |
||||
|
||||
/* Write w as exactly four hex digits to dest[0..3], most significant first; no NUL is appended. */
static void fmt_hex4(char* dest,uint16_t w) {
  int pos;
  // fill from the last digit backwards, consuming one nibble per step
  for (pos=3; pos>=0; --pos) {
    dest[pos]=fmt_tohex(w&0xf);
    w>>=4;
  }
}
||||
|
||||
/*
 * Write the JSON string escape for the code point ch to dest.
 *
 * \b, \n, \r, '"', '\\' and '/' use the two-byte backslash form.  Every
 * other code point up to 0xffff becomes a six-byte \uXXXX sequence; code
 * points above that become a surrogate pair of two such sequences
 * (12 bytes).
 *
 * dest may be null, in which case nothing is written.  Returns the length
 * of the escape, or 0 if ch is above 0x10ffff.
 */
size_t fmt_escapecharjson(char* dest,uint32_t ch) {
  size_t written=0;
  char shortform=0;
  switch (ch) {
  case '\b': shortform='b'; break;
  case '\n': shortform='n'; break;
  case '\r': shortform='r'; break;
  case '"':
  case '\\':
  case '/':
    shortform=(char)ch;
    break;
  default:
    break;
  }
  if (shortform) {
    if (dest) {
      dest[0]='\\';
      dest[1]=shortform;
    }
    return 2;
  }
  if (ch>0x10ffff) return 0;	// highest representable unicode codepoint
  if (ch>0xffff) {
    // outside the BMP: emit the high surrogate now, the low one below
    if (dest) {
      dest[0]='\\';
      dest[1]='u';
      fmt_hex4(dest+2,0xd800 | (((ch-0x10000)>>10)&0x3ff));
      dest+=6;
    }
    ch=(ch&0x3ff)|0xdc00;
    written=6;
  }
  if (dest) {
    dest[0]='\\';
    dest[1]='u';
    fmt_hex4(dest+2,ch);
  }
  return written+6;
}
@ -0,0 +1,11 @@
|
||||
#include "fmt.h" |
||||
|
||||
/*
 * Write the byte ch as '=' followed by two hex digits (as produced by
 * fmt_tohex) to dest.
 *
 * dest may be null, in which case nothing is written.  Returns 3, or 0 if
 * ch does not fit into one byte.
 */
size_t fmt_escapecharquotedprintable(char* dest,uint32_t ch) {
  if (ch>0xff) return 0;
  if (!dest) return 3;
  dest[0]='=';
  dest[1]=fmt_tohex((ch>>4)&0xf);
  dest[2]=fmt_tohex(ch&0xf);
  return 3;
}
@ -0,0 +1,10 @@
|
||||
#include "fmt.h" |
||||
|
||||
size_t fmt_escapecharquotedprintableutf8(char* dest,uint32_t ch) { |
||||
char buf[FMT_UTF8]; |
||||
size_t i,o,j=fmt_utf8(buf,ch); |
||||
if (!dest) return j*3; |
||||
for (i=o=0; i<j; ++i) |
||||
o+=fmt_escapecharquotedprintable(dest+o,(unsigned char)buf[i]); |
||||
return o; |
||||
} |
@ -0,0 +1,33 @@
|
||||
#include "fmt.h" |
||||
|
||||
size_t fmt_escapecharxml(char* dest,uint32_t ch) { |
||||
char a[FMT_LONG], b[FMT_XLONG]; |
||||
const char* s; |
||||
size_t i,j; |
||||
switch (ch) { |
||||
case '&': s="&"; goto string; |
||||
case '<': s="<"; goto string; |
||||
case '>': s=">"; goto string; |
||||
case '\'': s="'"; goto string; |
||||
case '"': s="""; goto string; |
||||
default: |
||||
a[i=fmt_ulong(a,ch)]=0; |
||||
b[0]='x'; |
||||
b[j=fmt_xlong(b+1,ch)+1]=0; |
||||
s=a; |
||||
if (i>j) { s=b; i=j; } |
||||
if (dest) { |
||||
dest[0]='&'; |
||||
dest[1]='#'; |
||||
byte_copy(dest+2,i,s); |
||||
dest[i+2]=';'; |
||||
} |
||||
return i+3; |
||||
} |
||||
string: |
||||
return fmt_str(dest,s); |
||||
} |
||||
|
||||
#ifdef __GNUC__ |
||||
size_t fmt_escapecharhtml(char* dest,uint32_t ch) __attribute__((__alias__("fmt_escapecharxml"))); |
||||
#endif |
Loading…
Reference in new issue