Browse Source
$ make WERROR=-Werror now builds with -Werror
$ make WERROR=-Werror now builds with -Werror
Add some single-char escaping routines to fmt.h; pull in HTML5 entities from W3C and use those to do a proper scan_html decoding; fix an off-by-one in fmt_to_array; add a ton of unit tests for the fmt routines. (branch: master)

26 changed files with 2759 additions and 24 deletions
-
1.cvsignore
-
16GNUmakefile
-
5Makefile
-
228ent.c
-
2233entities.json
-
31fmt.h
-
36fmt/fmt_escapecharc.c
-
9fmt/fmt_escapecharhtml.c
-
47fmt/fmt_escapecharjson.c
-
11fmt/fmt_escapecharquotedprintable.c
-
10fmt/fmt_escapecharquotedprintableutf8.c
-
33fmt/fmt_escapecharxml.c
-
24test/marshal.c
-
7test/textcode.c
-
2textcode/fmt_to_array.c
-
58textcode/scan_html.c
-
4tryalloca.c
-
2trybsdsf.c
-
4tryepoll.c
-
4tryip6.c
-
2tryn2i.c
-
4tryscope.c
-
2trysendfile.c
-
2trysigio.c
-
4trysl.c
-
4trysocket.c
@ -0,0 +1,228 @@ |
|||
#include <stdio.h> |
|||
#include <ctype.h> |
|||
#include <string.h> |
|||
#include <stdlib.h> |
|||
#include "scan.h" |
|||
#include <assert.h> |
|||
|
|||
#include "scan/scan_ulong.c" |
|||
#include "scan/scan_ulongn.c" |
|||
#include "fmt/fmt_utf8.c" |
|||
#include "fmt/fmt_escapecharc.c" |
|||
|
|||
char tmp[20]; |
|||
char tmp2[20]; |
|||
size_t n,m; |
|||
unsigned long l; |
|||
|
|||
struct entity { |
|||
const char* entity; |
|||
char utf8[10]; |
|||
struct entity* next; |
|||
}* root,** cur=&root; |
|||
|
|||
struct letter { |
|||
char c; |
|||
struct letters* weiter; |
|||
uint32_t marshaled; // lower 8 bits: char. rest: ofs from start of marshaled blob |
|||
}; |
|||
|
|||
struct letters { |
|||
size_t n; |
|||
struct letter liste[256]; |
|||
}; |
|||
|
|||
struct letters* d; |
|||
size_t nodes,datasize; |
|||
|
|||
void addword(struct letters** s,const char* t, void* pointer) { |
|||
size_t i; |
|||
if (!*s) { |
|||
*s=malloc(sizeof(**s)); |
|||
memset(*s,0,sizeof(**s)); |
|||
(*s)->liste[0].c='?'; |
|||
} |
|||
i=(unsigned char)*t; |
|||
if ((*s)->liste[i].c==*t) { |
|||
if (!*t) { |
|||
datasize+=strlen((char*)pointer)+1; |
|||
(*s)->liste[i].weiter=pointer; |
|||
} else |
|||
addword(&(*s)->liste[i].weiter,t+1,pointer); |
|||
return; |
|||
} |
|||
|
|||
++nodes; |
|||
(*s)->n++; |
|||
(*s)->liste[i].c=*t; |
|||
if (!*t) { |
|||
datasize+=strlen((char*)pointer)+1; |
|||
(*s)->liste[i].weiter=pointer; |
|||
} else { |
|||
(*s)->liste[i].weiter=0; |
|||
addword(&(*s)->liste[i].weiter,t+1,pointer); |
|||
} |
|||
} |
|||
|
|||
void dump(struct letters* s,size_t depth) { |
|||
size_t i,j; |
|||
if (!s) return; |
|||
for (i=0; i<256; ++i) { |
|||
if (s->liste[i].c!=i) continue; |
|||
for (j=0; j<depth; ++j) printf(" "); |
|||
printf("'%c' -> {\n",s->liste[i].c); |
|||
if (s->liste[i].c) |
|||
dump(s->liste[i].weiter,depth+1); |
|||
for (j=0; j<depth; ++j) printf(" "); |
|||
printf("}\n"); |
|||
} |
|||
} |
|||
|
|||
/* Marshalling state, filled in by marshal() and marshalhelper(). */
size_t used=0;               /* table slots handed out so far */
size_t useddata=0;           /* payload bytes copied so far */
char* heap=NULL;             /* the single allocation holding table and payload */
uint32_t* marshaled=NULL;    /* table view of heap; element 0 holds the slot count */
char* data=NULL;             /* payload area, located directly after the table */
|||
|
|||
size_t marshalhelper(struct letters* s) { |
|||
size_t i; |
|||
uint32_t myindex=used; |
|||
if (!s) return 0; |
|||
used+=s->n; |
|||
assert(used<nodes+2); |
|||
for (i=1; i!=0; ++i) { // start at 1, go to 256, then access modulo 256; effect: sort but put 0 last |
|||
uint32_t x; |
|||
i&=0xff; |
|||
// printf("%c ",i); |
|||
if (s->liste[i].c!=i) { |
|||
if (i==0) return; |
|||
continue; |
|||
} |
|||
// printf("marshalhelper: %c\n",i); |
|||
x=(unsigned char)s->liste[i].c; |
|||
if (!x) { |
|||
size_t l=strlen((char*)s->liste[i].weiter)+1; |
|||
// puts((char*)s->liste[i].weiter); |
|||
x|=useddata<<8; |
|||
assert(useddata+l<=datasize); |
|||
memcpy(data+useddata,s->liste[i].weiter,l); |
|||
useddata+=l; |
|||
marshaled[++myindex]=x; |
|||
return; |
|||
} else { |
|||
x|=(used+1)<<8; |
|||
marshalhelper(s->liste[i].weiter); |
|||
} |
|||
marshaled[++myindex]=x; |
|||
} |
|||
// printf("return\n"); |
|||
} |
|||
|
|||
void marshal(struct letters* s) { |
|||
fprintf(stderr,"nodes=%lu, datasize=%lu\n",nodes,datasize); |
|||
heap=malloc((nodes+1)*sizeof(uint32_t)+datasize); |
|||
if (!heap) return; |
|||
marshaled=(uint32_t*)heap; |
|||
marshaled[0]=nodes+1; |
|||
data=heap+(nodes+1)*sizeof(uint32_t); |
|||
marshalhelper(s); |
|||
fprintf(stderr,"actually used: %lu nodes, %lu bytes data\n",used,useddata); |
|||
} |
|||
|
|||
/*
 * Look up key `t` in a marshaled blob.
 *
 * ds  - blob start: a uint32_t table whose element 0 is the number of table
 *       elements, immediately followed by the payload bytes
 * ofs - table index at which the scan starts
 * t   - NUL-terminated key
 *
 * Each table element carries an edge label in its low 8 bits. On a label
 * match the upper bits are either the next table index to continue at
 * (non-zero label) or the payload offset (zero label). A scan stops after
 * passing an element whose label is zero.
 *
 * Returns a pointer into the payload area, or NULL when no match is found
 * or an index lies beyond the table.
 */
char* lookup(char* ds,size_t ofs,const char* t) {
  const uint32_t* table=(const uint32_t*)ds;
  const uint32_t limit=table[0];
  const char* key=t;

  if (ofs>limit) return NULL;

  while (ofs<limit) {
    const uint32_t entry=table[ofs];
    const unsigned char label=entry&0xff;

    if (label!=(unsigned char)*key) {
      ++ofs;
      if (!label) break;    /* stepped over a zero-labelled element */
      continue;
    }

    if (!label)             /* key exhausted: entry points at the payload */
      return ds+limit*sizeof(uint32_t)+(entry>>8);

    /* descend: continue scanning at the index stored in the entry */
    ofs=entry>>8;
    ++key;
    if (ofs>limit) return NULL;
  }
  return NULL;
}
|||
|
|||
int main() { |
|||
FILE* f=fopen("entities.json","r"); |
|||
char buf[256]; |
|||
if (!f) return 1; |
|||
#if 0 |
|||
puts("struct { const char* entity; const char* utf8; } codepoints[] = {"); |
|||
#endif |
|||
while (fgets(buf,sizeof(buf),f)) { |
|||
char* s,* entity; |
|||
size_t ul; |
|||
if (!isspace(buf[0])) continue; |
|||
for (s=buf; *s && *s!='"'; ++s) ; // skip whitespace |
|||
if (!*s=='"') continue; |
|||
++s; |
|||
entity=s; |
|||
if (*entity!='&') continue; ++entity; ++s; |
|||
for (; *s && *s!='"'; ++s) ; // skip to end of entity |
|||
if (!*s=='"') continue; |
|||
if (s[-1]!=';') continue; |
|||
s[-1]=0; ++s; |
|||
s=strchr(s,'['); |
|||
if (!s) continue; |
|||
n=0; |
|||
#if 0 |
|||
printf(" { \"%s\", \"",entity); |
|||
#endif |
|||
++s; |
|||
*cur=malloc(sizeof(**cur)); |
|||
(*cur)->next=0; |
|||
if (!((*cur)->entity=strdup(entity))) return 1; |
|||
ul=0; |
|||
do { |
|||
while (isspace(*s)) ++s; |
|||
m=scan_ulong(s,&l); |
|||
if (!m) return 2; |
|||
s+=n; |
|||
n=fmt_utf8(tmp,l); |
|||
if (ul+n>sizeof((*cur)->utf8)) return 3; |
|||
memcpy((*cur)->utf8+ul,tmp,n); |
|||
ul+=n; |
|||
#if 0 |
|||
{ |
|||
size_t i; |
|||
for (i=0; i<n; ++i) { |
|||
fwrite(tmp2,fmt_escapecharc(tmp2,(unsigned char)tmp[i]),1,stdout); |
|||
} |
|||
} |
|||
#endif |
|||
if (*s==']') break; |
|||
} while (*s==','); |
|||
#if 0 |
|||
puts("\" },"); |
|||
#endif |
|||
addword(&d,(*cur)->entity,(*cur)->utf8); |
|||
} |
|||
fclose(f); |
|||
// dump(d,0); |
|||
marshal(d); |
|||
{ |
|||
FILE* f=fopen("entities.h","w"); |
|||
size_t i; |
|||
fprintf(f,"struct {\n uint32_t tab[%lu];\n char data[%lu];\n} entities = {\n {",marshaled[0],datasize); |
|||
for (i=0; i<marshaled[0]; ++i) { |
|||
if (i%8 == 0) fprintf(f,"\n "); |
|||
fprintf(f,"0x%x,",marshaled[i]); |
|||
} |
|||
fprintf(f,"\n } , {"); |
|||
for (i=0; i<datasize; ++i) { |
|||
if (i%16 == 0) fprintf(f,"\n "); |
|||
fprintf(f,"0x%x,",data[i]&0xff); |
|||
} |
|||
fprintf(f,"\n }\n};"); |
|||
fclose(f); |
|||
} |
|||
// puts(lookup(heap,1,"zwnj")); |
|||
#if 0 |
|||
puts("};"); |
|||
#endif |
|||
return 0; |
|||
} |
2233
entities.json
File diff suppressed because it is too large
View File
File diff suppressed because it is too large
View File
@ -0,0 +1,36 @@ |
|||
#include "fmt.h" |
|||
|
|||
/* Write the low 9 bits' worth of `w` as exactly three octal digits to dest[0..2]. */
static void fmt_oct3(char* dest,uint8_t w) {
  size_t pos;
  for (pos=3; pos-->0; ) {
    dest[pos]=(char)('0'+(w&7));
    w>>=3;
  }
}

/*
 * Write the C-style backslash escape for byte value `ch` to `dest`.
 *
 * \a \b \e(0x1b) \f \n \r \t \v and backslash get a two-character escape;
 * every other value up to 0xff is written as a backslash plus three octal
 * digits. No terminator is written.
 *
 * Returns the number of characters the escape occupies (2 or 4), or 0 if
 * ch is above 0xff. If dest is NULL only the length is returned.
 */
size_t fmt_escapecharc(char* dest,uint32_t ch) {
  char letter=0;    /* 0 means: no short escape, use octal */

  if (ch>0xff) return 0;

  switch (ch) {
  case '\a': letter='a'; break;
  case '\b': letter='b'; break;
  case 0x1b: letter='e'; break;
  case '\f': letter='f'; break;
  case '\n': letter='n'; break;
  case '\r': letter='r'; break;
  case '\t': letter='t'; break;
  case '\v': letter='v'; break;
  case '\\': letter='\\'; break;
  default: break;
  }

  if (!letter) {
    if (dest) {
      dest[0]='\\';
      fmt_oct3(dest+1,ch&0xff);
    }
    return 4;
  }

  if (dest) {
    dest[0]='\\';
    dest[1]=letter;
  }
  return 2;
}
@ -0,0 +1,9 @@ |
|||
#ifndef __GNUC__ |
|||
|
|||
#include "fmt.h" |
|||
|
|||
/*
 * HTML escaping of a single character; forwards to fmt_escapecharxml and
 * returns whatever that function returns.
 */
size_t fmt_escapecharhtml(char* dest,uint32_t ch) {
  const size_t written=fmt_escapecharxml(dest,ch);
  return written;
}
|||
|
|||
#endif |
@ -0,0 +1,47 @@ |
|||
#include "fmt.h" |
|||
|
|||
/* Write `w` as exactly four digits to dest[0..3], most significant nibble
 * first, converting each nibble with fmt_tohex. */
static void fmt_hex4(char* dest,uint16_t w) {
  size_t pos;
  for (pos=4; pos-->0; ) {
    dest[pos]=fmt_tohex(w&0xf);
    w>>=4;
  }
}
|||
|
|||
/*
 * Write the JSON string escape for codepoint `ch` to `dest`.
 *
 * Backspace, newline and carriage return become \b \n \r; double quote,
 * backslash and slash are prefixed with a backslash. Everything else is
 * written as \uXXXX; codepoints above 0xffff are written as two \uXXXX
 * sequences (high half from 0xd800, low half from 0xdc00).
 *
 * Returns the number of characters the escape occupies (2, 6 or 12), or 0
 * if ch is above 0x10ffff. If dest is NULL only the length is returned.
 */
size_t fmt_escapecharjson(char* dest,uint32_t ch) {
  size_t prefix=0;     /* characters already emitted for a leading surrogate */
  char shortcut;       /* second character of a two-character escape, or 0 */

  switch (ch) {
  case '\b': shortcut='b'; break;
  case '\n': shortcut='n'; break;
  case '\r': shortcut='r'; break;
  case '"':
  case '\\':
  case '/':
    shortcut=(char)ch;
    break;
  default:
    shortcut=0;
    break;
  }

  if (shortcut) {
    if (dest) {
      dest[0]='\\';
      dest[1]=shortcut;
    }
    return 2;
  }

  if (ch>0x10ffff) return 0;    /* highest representable unicode codepoint */

  if (ch>0xffff) {
    if (dest) {
      dest[0]='\\';
      dest[1]='u';
      fmt_hex4(dest+2,0xd800 | (((ch-0x10000)>>10)&0x3ff));
      dest+=6;
    }
    ch=0xdc00|(ch&0x3ff);
    prefix=6;
  }

  if (dest) {
    dest[0]='\\';
    dest[1]='u';
    fmt_hex4(dest+2,ch);
  }
  return prefix+6;
}
@ -0,0 +1,11 @@ |
|||
#include "fmt.h" |
|||
|
|||
/*
 * Write byte value `ch` as a three-character sequence to `dest`: '='
 * followed by the high and the low nibble, each converted with fmt_tohex.
 *
 * Returns 3, or 0 if ch is above 0xff. If dest is NULL nothing is written.
 */
size_t fmt_escapecharquotedprintable(char* dest,uint32_t ch) {
  if (ch>0xff) return 0;
  if (!dest) return 3;

  dest[0]='=';
  dest[1]=fmt_tohex((ch>>4)&0xf);
  dest[2]=fmt_tohex(ch&0xf);
  return 3;
}
@ -0,0 +1,10 @@ |
|||
#include "fmt.h" |
|||
|
|||
size_t fmt_escapecharquotedprintableutf8(char* dest,uint32_t ch) { |
|||
char buf[FMT_UTF8]; |
|||
size_t i,o,j=fmt_utf8(buf,ch); |
|||
if (!dest) return j*3; |
|||
for (i=o=0; i<j; ++i) |
|||
o+=fmt_escapecharquotedprintable(dest+o,(unsigned char)buf[i]); |
|||
return o; |
|||
} |
@ -0,0 +1,33 @@ |
|||
#include "fmt.h" |
|||
|
|||
size_t fmt_escapecharxml(char* dest,uint32_t ch) { |
|||
char a[FMT_LONG], b[FMT_XLONG]; |
|||
const char* s; |
|||
size_t i,j; |
|||
switch (ch) { |
|||
case '&': s="&"; goto string; |
|||
case '<': s="<"; goto string; |
|||
case '>': s=">"; goto string; |
|||
case '\'': s="'"; goto string; |
|||
case '"': s="""; goto string; |
|||
default: |
|||
a[i=fmt_ulong(a,ch)]=0; |
|||
b[0]='x'; |
|||
b[j=fmt_xlong(b+1,ch)+1]=0; |
|||
s=a; |
|||
if (i>j) { s=b; i=j; } |
|||
if (dest) { |
|||
dest[0]='&'; |
|||
dest[1]='#'; |
|||
byte_copy(dest+2,i,s); |
|||
dest[i+2]=';'; |
|||
} |
|||
return i+3; |
|||
} |
|||
string: |
|||
return fmt_str(dest,s); |
|||
} |
|||
|
|||
#ifdef __GNUC__ |
|||
size_t fmt_escapecharhtml(char* dest,uint32_t ch) __attribute__((__alias__("fmt_escapecharxml"))); |
|||
#endif |
Write
Preview
Loading…
Cancel
Save
Reference in new issue