1:
2:
3:
4:
5:
6:
7:
8:
9:
10:
11:
12:
13:
14:
15:
16:
17:
18:
19:
20:
21:
22:
23:
24:
25:
26:
27:
28:
29:
30:
31:
32:
33:
34:
35:
36:
37:
38:
39:
40:
41:
42:
43:
44:
45:
46:
47:
48:
49:
50:
51:
52:
53:
54:
55:
56:
57:
58:
59:
60:
61:
62:
63:
64:
65:
66:
67:
68:
69:
70:
71:
72:
73:
74:
75:
76:
77:
78:
79:
80:
81:
82:
83:
84:
85:
86:
87:
88:
89:
90:
91:
92:
93:
94:
95:
96:
97:
98:
99:
100:
101:
102:
103:
104:
105:
106:
107:
108:
109:
110:
111:
112:
113:
114:
115:
116:
117:
118:
119:
120:
121:
122:
123:
124:
125:
126:
127:
128:
129:
130:
131:
132:
133:
134:
135:
136:
137:
138:
139:
140:
141:
142:
143:
144:
145:
146:
147:
148:
149:
150:
151:
152:
153:
154:
155:
156:
157:
158:
159:
160:
161:
162:
163:
164:
165:
166:
167:
168:
169:
170:
171:
172:
173:
174:
175:
176:
177:
178:
179:
180:
181:
182:
183:
184:
185:
186:
187:
188:
189:
190:
191:
192:
193:
194:
195:
196:
197:
198:
199:
200:
201:
202:
203:
204:
205:
206:
207:
208:
209:
210:
211:
212:
213:
214:
215:
216:
217:
218:
219:
220:
221:
222:
223:
224:
225:
226:
227:
228:
229:
230:
231:
232:
233:
234:
235:
236:
237:
238:
239:
240:
241:
242:
243:
244:
245:
246:
247:
248:
249:
250:
251:
252:
253:
254:
255:
256:
257:
258:
259:
260:
261:
262:
263:
264:
265:
266:
267:
268:
269:
270:
271:
272:
273:
274:
275:
276:
277:
278:
279:
280:
281:
282:
283:
284:
285:
286:
287:
288:
289:
290:
291:
292:
293:
294:
295:
296:
297:
298:
299:
300:
301:
302:
303:
304:
305:
306:
307:
308:
309:
310:
311:
312:
313:
314:
315:
316:
317:
318:
319:
320:
321:
322:
323:
324:
325:
326:
327:
328:
329:
330:
331:
332:
333:
334:
335:
|
#include <stdio.h>
#include <string.h>
#include <wchar.h>
#include <stdint.h>
#include <ctype.h>
#include <libxml/tree.h>
#include <libxml/xmlreader.h>
#include <libxml/parser.h>
#include <libxml/parserInternals.h>
#define CONST_BUCKETS_COUNT (128*1024*1024)
#define CONST_INIT_WORDLIST_SIZE 1024
#define CONST_INIT_CONTENT_BUFFER_SIZE 1024
typedef struct _Entry Entry;
struct _Entry {
uint64_t wordOffset;
uint32_t count;
Entry *next;
};
typedef struct _WordList WordList;
struct _WordList{
char *words;
size_t wordsLength;
uint64_t wordsPos;
Entry *buckets;
int bucketsCount;
};
typedef struct _Content Content;
struct _Content {
char *buffer;
int len;
size_t allocated_size;
};
typedef struct _parserState ParserState;
struct _parserState{
int level;
int state;
Content content[1];
WordList wordList[1];
};
uint64_t fnFNV(const char* word){
uint64_t nHashVal = 0xcbf29ce484222325ULL,
nMagicPrime = 0x00000100000001b3ULL;
while(word[0]){
nHashVal ^= *word++;
nHashVal *= nMagicPrime;
}
return nHashVal;
}
void startDocumentHandler(ParserState *state){
state->level = 0;
state->state = 0;
state->content->buffer = (char *)malloc(CONST_INIT_CONTENT_BUFFER_SIZE);
state->content->len = 0;
state->content->allocated_size = CONST_INIT_CONTENT_BUFFER_SIZE;
state->wordList->words = (char *)malloc(sizeof(char)*CONST_INIT_WORDLIST_SIZE);
state->wordList->wordsLength = sizeof(char)*CONST_INIT_WORDLIST_SIZE;
state->wordList->wordsPos = 0;
state->wordList->buckets = (Entry *)calloc(CONST_BUCKETS_COUNT, sizeof(Entry));
state->wordList->bucketsCount = CONST_BUCKETS_COUNT;
};
void print_wordlist(ParserState *state){
int i, cnt=0;
Entry *bucket;
for(i=0;i<CONST_BUCKETS_COUNT;i++){
if(state->wordList->buckets[i].wordOffset != 0){
cnt++;
printf("%s\n", &state->wordList->words[state->wordList->buckets[i].wordOffset]);
bucket = state->wordList->buckets[i].next;
while(bucket){
printf("%s\n", &state->wordList->words[bucket->wordOffset]);
bucket = bucket->next;
cnt++;
}
}
}
printf("%d\n", cnt);
}
void print_stats(ParserState *state){
int i, used_bucket_count = 0, collissions_count = 0;
Entry *bucket;
uint32_t max = 1;
uint32_t *occurances;
uint32_t *tmp;
occurances = (uint32_t *)calloc(sizeof(uint32_t),max);
for(i=0;i<CONST_BUCKETS_COUNT;i++){
if(state->wordList->buckets[i].wordOffset != 0){
bucket = &state->wordList->buckets[i];
used_bucket_count++;
if(max<bucket->count){
tmp = (uint32_t *)calloc(sizeof(uint32_t),bucket->count);
memcpy(tmp, occurances, sizeof(uint32_t)*max);
max=bucket->count;
free(occurances);
occurances = tmp;
printf("%9d, %s\n", max, &state->wordList->words[bucket->wordOffset]);
}
occurances[bucket->count - 1]++;
bucket = state->wordList->buckets[i].next;
while(bucket){
if(max<bucket->count){
tmp = (uint32_t *)calloc(sizeof(uint32_t),bucket->count);
memcpy(tmp, occurances, sizeof(uint32_t)*max);
max=bucket->count;
free(occurances);
occurances = tmp;
printf("%9d, %s\n", max, &state->wordList->words[bucket->wordOffset]);
}
occurances[bucket->count - 1]++;
bucket = bucket->next;
collissions_count++;
}
}
}
printf("Stats:\n");
printf("%15s %15s %15s %15s\n", "buckets avail", "buckets used", "colls ct", "words ct");
printf("%15d %15d %15d %15d\n", CONST_BUCKETS_COUNT, used_bucket_count, collissions_count, used_bucket_count + collissions_count);
printf("\n");
for(i=0;i<max;i++){
if(occurances[i]){
printf("%9d %d\n", i+1, occurances[i]);
}
}
}
void free_buckets(Entry *bucket){
if(bucket->next){
free_buckets(bucket->next);
}
free(bucket);
}
void save_wordlist(WordList *wordList){
FILE *fptr;
int i;
Entry *bucket;
fptr = fopen("wordList.bin", "w");
if(!fptr){
printf("the wordlist-file could not be opened.\n");
} else {
for(i=0;i<CONST_BUCKETS_COUNT;i++){
if(wordList->buckets[i].wordOffset){
bucket = &wordList->buckets[i];
if(bucket->count>100){
fprintf(fptr, "%s%c", &wordList->words[bucket->wordOffset], '\0');
}
}
}
//fwrite(wordList->words, 1, wordList->wordsPos, fptr);
fclose(fptr);
}
}
void endDocumentHandler(ParserState *state){
int i;
//print_wordlist(state);
print_stats(state);
save_wordlist(state->wordList);
state->level = 0;
state->state = 0;
free(state->content->buffer);
free(state->wordList->words);
for(i=0;i<CONST_BUCKETS_COUNT;i++){
if(state->wordList->buckets[i].next){
free_buckets(state->wordList->buckets[i].next);
}
}
free(state->wordList->buckets);
}
void appendContent(ParserState *state, const xmlChar *buffer, int len){
while(state->content->allocated_size<state->content->len+len+1){
state->content->allocated_size*=2;
state->content->buffer = (char *)realloc(state->content->buffer, state->content->allocated_size);
}
memcpy(&state->content->buffer[state->content->len], buffer, len);
state->content->len+=len;
state->content->buffer[state->content->len] = 0;
}
void appendWordList(WordList *wordList, char *word){
int len;
len = strlen(word);
while(wordList->wordsPos+len+1>=wordList->wordsLength-1){
printf("resize: %lu, %lu\n", wordList->wordsLength*2, wordList->wordsPos);
wordList->words = (char *)realloc(wordList->words, wordList->wordsLength*2);
wordList->wordsLength*=2;
}
memcpy(&wordList->words[wordList->wordsPos], word, len);
wordList->words[wordList->wordsPos+len] = '\0';
wordList->wordsPos+=len+1;
}
int cmpStrings(char *a, char *b){
while(a[0]&&b[0]){
if(a[0]!=b[0]) return 0;
a++;
b++;
}
return (a[0]==b[0]);
}
void insertUniqe(ParserState *state, char *word){
uint64_t hash;
Entry *bucket;
WordList *wordList;
hash = fnFNV(word)%CONST_BUCKETS_COUNT;
bucket = &state->wordList->buckets[hash];
wordList = state->wordList;
if(bucket->wordOffset == 0){
bucket->wordOffset = wordList->wordsPos;
bucket->count = 1;
} else {
if(cmpStrings(&wordList->words[bucket->wordOffset], word)) { bucket->count +=1; return;}
while(bucket->next){
bucket = bucket->next;
if(cmpStrings(&wordList->words[bucket->wordOffset], word)) { bucket->count +=1; return;}
}
bucket->next = (Entry *)calloc(1, sizeof(Entry));
bucket = bucket->next;
bucket->wordOffset = wordList->wordsPos;
bucket->count = 1;
}
appendWordList(state->wordList, word);
}
void to_lower(char *word){
while(word[0]){
word[0] = tolower(word[0]);
word++;
}
}
void createWordList(ParserState *state, char *text){
char *pch;
char *split = " \"\\/\n\t,-_'<>|().:;=[]{}";
pch = strtok(text, split);
while(pch!=NULL){
//to_lower(pch);
insertUniqe(state, pch);
pch = strtok(NULL, split);
}
}
void startElementHandler(ParserState *state, char *name, char **atts){
if(!strcmp("text", name)){
state->state=2;
}
if(!strcmp("title", name)){
state->state=1;
}
state->level++;
}
void endElementHandler(ParserState *state, char *name, char **atts){
static int narf = 0;
state->level--;
if(state->state==2){
narf++;
createWordList(state, state->content->buffer);
if(!(narf%1000)) printf("narf: %d\n", narf);
}
state->content->len=0;
state->state=0;
}
void characterHandler(ParserState *state, const xmlChar *content, int len){
if(state->state==2){
appendContent(state, content, len);
}
}
static xmlSAXHandler SAXParser = {
0, /* internalSubset */
0, /* isStandalone */
0, /* hasInternalSubset */
0, /* hasExternalSubset */
0, /* resolveEntity */
0, /* getEntity */
0, /* entityDecl */
0, /* notationDecl */
0, /* attributeDecl */
0, /* elementDecl */
0, /* unparsedEntityDecl */
0, /* setDocumentLocator */
(startDocumentSAXFunc)startDocumentHandler, /* startDocument */
(endDocumentSAXFunc)endDocumentHandler, /* endDocument */
(startElementSAXFunc)startElementHandler, /* startElement */
(endElementSAXFunc)endElementHandler, /* endElement */
0, /* reference */
(charactersSAXFunc)characterHandler, /* characters */
0, /* ignorableWhitespace */
0, /* processingInstruction */
0, /* comment */
0, /* warning */
0, /* error */
0, /* fatalError */
};
int main(int argc, char **argv){
ParserState state;
xmlParserCtxtPtr ctxt;
if(argc==2){
ctxt = xmlCreateFileParserCtxt(argv[1]);
if(!ctxt) return EXIT_FAILURE;
ctxt->sax = &SAXParser;
ctxt->userData = &state;
xmlParseDocument(ctxt);
} else {
printf("Fehler!\n");
}
}
|