/** * This program handles xml files of the form: * * * * ... * ... * ... * * * * The document is assumed to be well-formed and valid. * Three kinds of entries are allowed, * - a very restricted form of a dictionary entry. * - a very unrestricted form of a dictionary entry. * - an entry which can have other entries. * The value of the key attribute is used as the key for the entry in the module. * Note, for a only it's key becomes a SWORD key. * Keys of entries internal to it are not used. * * The entries must be sorted according to an ASCII collation of their bytes. * This should be the same for Latin-1 and for UTF-8 * * Sword will allow for any tags, but only a few have any styling. * * author DM Smith */ #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef _ICU_ #include #include #endif #ifndef NO_SWORD_NAMESPACE using namespace sword; #endif using namespace std; #ifdef _ICU_ UTF8NFC normalizer; int normalized = 0; Latin1UTF8 converter; int converted = 0; #endif //#define DEBUG SWLD *module = NULL; SWKey *currentKey = NULL; bool normalize = true; /** * Determine whether the string contains a valid unicode sequence. * The following table give the pattern of a valid UTF-8 character. * Unicode Range 1st 2nd 3rd 4th * U-00000000 - U-0000007F 0nnnnnnn * U-00000080 - U-000007FF 110nnnnn 10nnnnnn * U-00000800 - U-0000FFFF 1110nnnn 10nnnnnn 10nnnnnn * U-00010000 - U-001FFFFF 11110nnn 10nnnnnn 10nnnnnn 10nnnnnn * Note: * 1. The latest UTF-8 RFC allows for a max of 4 bytes. * Earlier allowed 6. * 2. The number of bits of the leading byte before the first 0 * is the total number of bytes. * 3. The "n" are the bits of the unicode codepoint. * This routine does not check to see if the code point is in the range. * It could. * * param txt the text to check * return 1 if all high order characters form a valid unicode sequence * -1 if there are no high order characters. * Note: this is also a valid unicode sequence * 0 if there are high order characters that do not form * a valid unicode sequence * author DM Smith */ int detectUTF8(const char *txt) { unsigned int countUTF8 = 0; int count = 0; // Cast it to make masking and shifting easier const unsigned char *p = (const unsigned char*) txt; while (*p) { // Is the high order bit set? if (*p & 0x80) { // Then count the number of high order bits that are set. // This determines the number of following bytes // that are a part of the unicode character unsigned char i = *p; for (count = 0; i & 0x80; count++) { i <<= 1; } // Validate count: // Count 0: bug in code that would cause core walking // Count 1: is a pattern of 10nnnnnn, // which does not signal the start of a unicode character // Count 5 to 8: 111110nn, 1111110n and 11111110 and 11111111 // are not legal starts, either if (count < 2 || count > 4) return 0; // At this point we expect (count - 1) following characters // of the pattern 10nnnnnn while (--count && *++p) { // The pattern of each following character must be: 10nnnnnn // So, compare the top 2 bits. if ((0xc0 & *p) != 0x80) return 0; } // Oops, we've run out of bytes too soon: Cannot be UTF-8 if (count) return 0; // We have a valid UTF-8 character, so count it countUTF8++; } // Advance to the next character to examine. p++; } // At this point it is either UTF-8 or 7-bit ascii return countUTF8 ? 1 : -1; } void normalizeInput(SWKey &key, SWBuf &text) { #ifdef _ICU_ int utf8State = detectUTF8(text.c_str()); if (normalize) { // Don't need to normalize text that is ASCII // But assume other non-UTF-8 text is Latin1 (cp1252) and convert it to UTF-8 if (!utf8State) { cout << "Warning: " << key << ": Converting to UTF-8 (" << text << ")" << endl; converter.processText(text, (SWKey *)2); // note the hack of 2 to mimic a real key. TODO: remove all hacks converted++; // Prepare for double check. This probably can be removed. // But for now we are running the check again. // This is to determine whether we need to normalize output of the conversion. utf8State = detectUTF8(text.c_str()); } // Double check. This probably can be removed. if (!utf8State) { cout << "Error: " << key << ": Converting to UTF-8 (" << text << ")" << endl; } if (utf8State > 0) { SWBuf before = text; normalizer.processText(text, (SWKey *)2); // note the hack of 2 to mimic a real key. TODO: remove all hacks if (before != text) { normalized++; } } } #endif } void writeEntry(SWKey &key, SWBuf &text) { #ifdef DEBUG cout << key << endl; #endif module->setKey(key); normalizeInput(key, text); module->setEntry(text); } void linkToEntry(SWBuf &keyBuf, vector &linkBuf) { /* char links = linkBuf.size(); for (int i = 0; i < links; i++) { SWKey tmpkey = linkBuf[i].c_str(); module->linkEntry(&tmpkey); cout << "Linking: " << linkBuf[i] << endl; } */ } // Return true if the content was handled or is to be ignored. // false if the what has been seen is to be accumulated and considered later. bool handleToken(SWBuf &text, XMLTag *token) { // The start token for the current entry; static XMLTag startTag; static SWBuf keyBuf; // Flags to indicate whether we are in a entry, entryFree or superentry static bool inEntry = false; static bool inEntryFree = false; static bool inSuperEntry = false; const char *tokenName = token->getName(); //-- START TAG ------------------------------------------------------------------------- if (!token->isEndTag()) { // If we are not in an "entry" and we see one, then enter it. if (!inEntry && !inEntryFree && !inSuperEntry) { inEntry = !strcmp(tokenName, "entry"); inEntryFree = !strcmp(tokenName, "entryFree"); inSuperEntry = !strcmp(tokenName, "superentry"); if (inEntry || inEntryFree || inSuperEntry) { #ifdef DEBUG cout << "Entering " << tokenName << endl; #endif startTag = *token; text = ""; *currentKey = token->getAttribute("key"); return false; // make tag be part of the output } } } //-- EMPTY and END TAG --------------------------------------------------------------------------------------------- else { // ENTRY end // If we see the end of an entry that we are in, then leave it if ((inEntry && !strcmp(tokenName, "entry" )) || (inEntryFree && !strcmp(tokenName, "entryFree" )) || (inSuperEntry && !strcmp(tokenName, "superentry"))) { #ifdef DEBUG cout << "Leaving " << tokenName << endl; #endif // Only one is false coming into here, // but all must be on leaving. inEntry = false; inEntryFree = false; inSuperEntry = false; text += token->toString(); writeEntry(*currentKey, text); // Since we consumed the text, clear it // and tell the caller that the tag was consumed. text = ""; return true; } } return false; } void usage(const char *app, const char *error = 0) { if (error) fprintf(stderr, "\n%s: %s\n", app, error); fprintf(stderr, "TEI Lexicon/Dictionary/Daily Devotional/Glossary module creation tool for the SWORD Project\n"); fprintf(stderr, "\nusage: %s [OPTIONS]\n", app); fprintf(stderr, " -z\t\t\t use ZIP compression (default no compression)\n"); fprintf(stderr, " -Z\t\t\t use LZSS compression (default no compression)\n"); fprintf(stderr, " -s <2|4>\t\t max text size per entry(default 4):\n"); fprintf(stderr, " -c \t encipher module using supplied key\n"); fprintf(stderr, "\t\t\t\t (default no enciphering)\n"); fprintf(stderr, " -N\t\t\t Do not convert UTF-8 or normalize UTF-8 to NFC\n"); fprintf(stderr, "\t\t\t\t (default is to convert to UTF-8, if needed, and then normalize to NFC"); fprintf(stderr, "\t\t\t\t Note: all UTF-8 texts should be normalized to NFC\n"); fprintf(stderr, "-z, -Z, and -s are mutually exclusive\n"); exit(-1); } int main(int argc, char **argv) { SWBuf program = argv[0]; fprintf(stderr, "You are running %s: $Rev: 2138 $\n", argv[0]); // Let's test our command line arguments if (argc < 3) { usage(*argv); } // variables for arguments, holding defaults SWBuf path = argv[1]; SWBuf teiDoc = argv[2]; SWBuf compType = ""; SWBuf modDrv = ""; SWBuf recommendedPath = "./modules/lexdict/"; SWBuf cipherKey = ""; SWCompress *compressor = 0; for (int i = 3; i < argc; i++) { if (!strcmp(argv[i], "-z")) { if (compType.size()) usage(*argv, "Cannot specify both -z and -Z"); if (modDrv.size()) usage(*argv, "Cannot specify both -z and -s"); compType = "ZIP"; modDrv = "zLD"; recommendedPath += "zld/"; } else if (!strcmp(argv[i], "-Z")) { if (compType.size()) usage(*argv, "Cannot specify both -z and -Z"); if (modDrv.size()) usage(*argv, "Cannot specify both -Z and -s"); compType = "LZSS"; recommendedPath += "zld/"; } else if (!strcmp(argv[i], "-s")) { if (compType.size()) usage(*argv, "Cannot specify both -s and -z or -Z"); if (i+1 < argc) { int size = atoi(argv[++i]); if (size == 2) { modDrv = "RawLD"; recommendedPath += "rawld/"; continue; } if (size == 4) { modDrv = "RawLD4"; recommendedPath += "rawld4/"; continue; } } usage(*argv, "-s requires one of <2|4>"); } else if (!strcmp(argv[i], "-N")) { normalize = false; } else if (!strcmp(argv[i], "-c")) { if (i+1 < argc) cipherKey = argv[++i]; else usage(*argv, "-c requires "); } else usage(*argv, (((SWBuf)"Unknown argument: ")+ argv[i]).c_str()); } if (!modDrv.size()) { modDrv = "RawLD4"; recommendedPath += "rawld4/"; } #ifndef _ICU_ if (normalize) { normalize = false; cout << program << " is not compiled with support for ICU. Setting -N flag." << endl; } #endif if (compType == "ZIP") { compressor = new ZipCompress(); } else if (compType = "LZSS") { compressor = new LZSSCompress(); } #ifdef DEBUG // cout << "path: " << path << " teiDoc: " << teiDoc << " compressType: " << compType << " ldType: " << modDrv << " cipherKey: " << cipherKey.c_str() << " normalize: " << normalize << "\n"; cout << "path: " << path << " teiDoc: " << teiDoc << " compressType: " << compType << " ldType: " << modDrv << " normalize: " << normalize << "\n"; cout << ""; // exit(-3); #endif SWBuf modName = path; int pathlen = path.length(); char lastChar = path[pathlen - 1]; if (lastChar != '/' && lastChar != '\\') { modName += "/"; } modName += "dict"; SWBuf keyBuf; SWBuf entBuf; SWBuf lineBuf; vector linkBuf; if (modDrv == "zLD") { if (zLD::createModule(modName)) { fprintf(stderr, "error: %s: couldn't create module at path: %s \n", program.c_str(), modName.c_str()); exit(-3); } module = new zLD(modName, 0, 0, 30, compressor); } else if (modDrv == "RawLD") { if (RawLD::createModule(modName)) { fprintf(stderr, "error: %s: couldn't create module at path: %s \n", program.c_str(), modName.c_str()); exit(-3); } module = new RawLD(modName); } else { if (RawLD4::createModule(modName)) { fprintf(stderr, "error: %s: couldn't create module at path: %s \n", program.c_str(), modName.c_str()); exit(-3); } module = new RawLD4(modName); } SWFilter *cipherFilter = 0; if (cipherKey.size()) { fprintf(stderr, "Adding cipher filter with phrase: %s\n", cipherKey.c_str() ); cipherFilter = new CipherFilter(cipherKey.c_str()); module->AddRawFilter(cipherFilter); } if (!module->isWritable()) { fprintf(stderr, "The module is not writable. Writing text to it will not work.\nExiting.\n" ); exit(-1); } // Let's see if we can open our input file ifstream infile(teiDoc); if (infile.fail()) { fprintf(stderr, "error: %s: couldn't open input file: %s \n", program.c_str(), teiDoc.c_str()); exit(-2); } currentKey = module->CreateKey(); currentKey->Persist(1); module->setKey(*currentKey); (*module) = TOP; SWBuf token; SWBuf text; bool intoken = false; char curChar = '\0'; while (infile.good()) { curChar = infile.get(); // skip the character if it is bad. infile.good() will catch the problem if (curChar == -1) { continue; } if (!intoken && curChar == '<') { intoken = true; token = "<"; continue; } if (intoken && curChar == '>') { intoken = false; token.append('>'); XMLTag *t = new XMLTag(token.c_str()); if (!handleToken(text, t)) { text.append(*t); } continue; } if (intoken) token.append(curChar); else switch (curChar) { case '>' : text.append(">"); break; case '<' : text.append("<"); break; default : text.append(curChar); break; } } // Force the last entry from the text buffer. //text = ""; //writeEntry(*currentKey, text); delete module; delete currentKey; if (cipherFilter) delete cipherFilter; infile.close(); #ifdef _ICU_ if (converted) fprintf(stderr, "tei2mod converted %d verses to UTF-8\n", converted); if (normalized) fprintf(stderr, "tei2mod normalized %d verses to NFC\n", normalized); #endif /* * Suggested module name detection. * Only used for suggesting a conf. * * Various forms of path. * . and .. - no module name given, use "dict". * Or one of the following where z is the module name * and x may be . or .. * z * x/y/z * x/y/z/ * x/y/z/z */ SWBuf suggestedModuleName = path; if (lastChar == '/' || lastChar == '\\') { suggestedModuleName.setSize(--pathlen); } lastChar = suggestedModuleName[pathlen - 1]; if (lastChar == '.') { suggestedModuleName = "???"; } else { /* At this point the suggestion is either * what follows the last / or \ * or the entire string */ const char *m = strrchr(suggestedModuleName.c_str(), '/'); if (!m) { m = strrchr(suggestedModuleName.c_str(), '\\'); } if (m) { suggestedModuleName = m+1; } } recommendedPath += suggestedModuleName; recommendedPath += "/dict"; fprintf(stderr, "\nSuggested conf (replace ??? with appropriate values)\n\n"); fprintf(stderr, "[%s]\n", suggestedModuleName.c_str()); fprintf(stderr, "DataPath=%s\n", recommendedPath.c_str()); fprintf(stderr, "Description=???\n"); fprintf(stderr, "SourceType=TEI\n"); fprintf(stderr, "Encoding=%s\n", (normalize ? "UTF-8" : "???")); fprintf(stderr, "ModDrv=%s\n", modDrv.c_str()); if (compressor) { fprintf(stderr, "CompressType=%s\n", compType.c_str()); } if (cipherKey.size()) { fprintf(stderr, "CipherKey=%s\n", cipherKey.c_str()); } }