#include "drw_textcodec.h"
// NOTE(review): four standard-header names that followed here were lost in this
// copy of the file (only bare "#include" tokens were left). The four headers
// below are the ones the code visible in this file uses; confirm them against
// the upstream file and re-add any header needed by code outside this view.
#include <cstdio>   // sscanf, snprintf (used in the __APPLE__ branches)
#include <cstring>  // std::strncmp
#include <iomanip>  // std::setfill, std::setw
#include <sstream>  // std::istringstream, std::stringstream
#include "../drw_base.h"
#include "drw_cptables.h"
#include "drw_cptable932.h"
#include "drw_cptable936.h"
#include "drw_cptable949.h"
#include "drw_cptable950.h"

/**
 * Builds a codec whose version is DRW::AC1021 and whose converter is a plain
 * DRW_Converter (no table).
 */
DRW_TextCodec::DRW_TextCodec()
    : version{DRW::AC1021}
    , conv( new DRW_Converter(nullptr, 0) )
{
}

DRW_TextCodec::~DRW_TextCodec() = default;

/**
 * Picks the default code page for a drawing version and installs the matching
 * converter through setCodePage().
 *
 * @param v         drawing version.
 * @param dxfFormat passed on to setCodePage(); for AC1021 and later it also
 *                  selects between "UTF-8" (true) and "UTF-16" (false).
 *
 * Versions up to AC1004 change nothing. AC1006/AC1009 store AC1009 and
 * AC1012..AC1018 store AC1015 in 'version', both with "ANSI_1252".
 * The AC1021+ branch does not assign 'version' (the assignment is commented
 * out), so setCodePage() sees whatever value 'version' already holds.
 */
void DRW_TextCodec::setVersion(DRW::Version v, bool dxfFormat){
    switch (v) {
    case DRW::UNKNOWNV:
    case DRW::MC00:
    case DRW::AC12:
    case DRW::AC14:
    case DRW::AC150:
    case DRW::AC210:
    case DRW::AC1002:
    case DRW::AC1003:
    case DRW::AC1004:
        // unhandled?
        break;
    case DRW::AC1006:
    case DRW::AC1009: {
        version = DRW::AC1009;
        cp = "ANSI_1252";
        setCodePage( cp, dxfFormat);
        break;
    }
    case DRW::AC1012:
    case DRW::AC1014:
    case DRW::AC1015:
    case DRW::AC1018: {
        version = DRW::AC1015;
//        if (cp.empty()) { //codepage not set, initialize ??
            cp = "ANSI_1252";
            setCodePage( cp, dxfFormat);
//        }
        break;
    }
    case DRW::AC1021:
    case DRW::AC1024:
    case DRW::AC1027:
    case DRW::AC1032: {
//        version = DRW::AC1021; // fixme - check why version was limited there...
        if (dxfFormat)
            cp = "UTF-8";//RLZ: can be UCS2 or UTF-16 16bits per char
        else
            cp = "UTF-16";//RLZ: can be UCS2 or UTF-16 16bits per char
        setCodePage( cp, dxfFormat);
        break;
    }
    }
}

/**
 * Looks the version string up in DRW::dwgVersionStrings and applies the first
 * entry whose key equals 'v' (compared with strncmp over at most 32 chars).
 *
 * 'version' is reset to DRW::UNKNOWNV first and stays that way when no entry
 * matches; in that case the code page and converter are left untouched.
 *
 * @param v         version string to look up.
 * @param dxfFormat forwarded to setVersion(DRW::Version, bool).
 */
void DRW_TextCodec::setVersion(const std::string &v, bool dxfFormat){
    version = DRW::UNKNOWNV;
    for (const auto &entry : DRW::dwgVersionStrings) {
        if ( std::strncmp( v.c_str(), entry.first, 32) == 0 ) {
            version = entry.second;
            setVersion( entry.second, dxfFormat);
            break;
        }
    }
}

/**
 * Stores the code page name (after passing it through correctCodePage(), which
 * is defined outside this view) and replaces the active converter.
 *
 * When 'version' is AC1009 or AC1015 a table based converter is chosen by code
 * page name; "UTF-8" is rewritten to "ANSI_1252" with a plain DRW_Converter and
 * any name not listed falls back to the 1252 table. For every other version a
 * plain DRW_Converter is used for DXF and DRW_ConvUTF16 otherwise.
 *
 * @param c         requested code page name.
 * @param dxfFormat true when the text comes from / goes to a DXF file.
 */
void DRW_TextCodec::setCodePage(const std::string &c, bool dxfFormat){
    cp = correctCodePage(c);
    conv.reset();
    if (version == DRW::AC1009 || version == DRW::AC1015) {
        if (cp == "ANSI_874")
            conv.reset( new DRW_ConvTable(DRW_Table874, CPLENGTHCOMMON) );
        else if (cp == "ANSI_932")
            conv.reset( new DRW_Conv932Table() );
        else if (cp == "ANSI_936")
            conv.reset( new DRW_ConvDBCSTable(DRW_Table936, DRW_LeadTable936, DRW_DoubleTable936, CPLENGTH936) );
        else if (cp == "ANSI_949")
            conv.reset( new DRW_ConvDBCSTable(DRW_Table949, DRW_LeadTable949, DRW_DoubleTable949, CPLENGTH949) );
        else if (cp == "ANSI_950")
            conv.reset( new DRW_ConvDBCSTable(DRW_Table950, DRW_LeadTable950, DRW_DoubleTable950, CPLENGTH950) );
        else if (cp == "ANSI_1250")
            conv.reset( new DRW_ConvTable(DRW_Table1250, CPLENGTHCOMMON) );
        else if (cp == "ANSI_1251")
            conv.reset( new DRW_ConvTable(DRW_Table1251, CPLENGTHCOMMON) );
        else if (cp == "ANSI_1253")
            conv.reset( new DRW_ConvTable(DRW_Table1253, CPLENGTHCOMMON) );
        else if (cp == "ANSI_1254")
            conv.reset( new DRW_ConvTable(DRW_Table1254, CPLENGTHCOMMON) );
        else if (cp == "ANSI_1255")
            conv.reset( new DRW_ConvTable(DRW_Table1255, CPLENGTHCOMMON) );
        else if (cp == "ANSI_1256")
            conv.reset( new DRW_ConvTable(DRW_Table1256, CPLENGTHCOMMON) );
        else if (cp == "ANSI_1257")
            conv.reset( new DRW_ConvTable(DRW_Table1257, CPLENGTHCOMMON) );
        else if (cp == "ANSI_1258")
            conv.reset( new DRW_ConvTable(DRW_Table1258, CPLENGTHCOMMON) );
        else if (cp == "UTF-8") { //DXF older than 2007 are written in win codepages
            cp = "ANSI_1252";
            conv.reset( new DRW_Converter(nullptr, 0) );
        } else
            conv.reset( new DRW_ConvTable(DRW_Table1252, CPLENGTHCOMMON) );
    } else {
        if (dxfFormat)
            conv.reset( new DRW_Converter(nullptr, 0) );//plain converter, see DRW_Converter::toUtf8
        else
            conv.reset( new DRW_ConvUTF16() );//utf16 to utf8
    }
}

/** Converts 's' to UTF-8 with the currently installed converter. */
std::string DRW_TextCodec::toUtf8(const std::string &s) {
    return conv->toUtf8(s);
}

/** Converts the UTF-8 string 's' with the currently installed converter. */
std::string DRW_TextCodec::fromUtf8(const std::string &s) {
    return conv->fromUtf8(s);
}

/**
 * Copies 's' and replaces every 7-character escape of the form \U+XXXX by the
 * UTF-8 bytes produced by encodeText(). All other bytes are copied unchanged;
 * bytes >= 0x80 are treated as the start of a 2, 3 or 4 byte sequence and the
 * following bytes of that sequence are skipped without being inspected.
 *
 * @param s input text.
 * @return  's' with the \U+XXXX escapes expanded.
 */
std::string DRW_Converter::toUtf8(const std::string &s) {
    std::string result;
    std::string::size_type j = 0;   // start of the span not yet copied to result
    for (std::string::size_type i = 0; i < s.length(); i++) {
        unsigned char c = s.at(i);
        if (c < 0x80) { //ascii check for \U+????
            // i+6 < length guarantees that all 7 characters of the escape exist
            if (c == '\\' && i+6 < s.length() && s.at(i+1) == 'U' && s.at(i+2) == '+') {
                result += s.substr(j,i-j);
                result += encodeText(s.substr(i,7));
                i +=6;
                j = i+1;
            }
        } else if (c < 0xE0 ) {//2 bytes
            i++;
        } else if (c < 0xF0 ) {//3 bytes
            i +=2;
        } else if (c < 0xF8 ) {//4 bytes
            i +=3;
        }
    }
    result += s.substr(j);

    return result;
}

// NOTE(review): this copy of the file is damaged inside the function below.
// Everything between "k" in the inner loop header and "6 && ..." is missing
// (text from a '<' up to the following '>' appears to have been dropped). The
// lost span covers the rest of DRW_ConvTable::fromUtf8 and the head of the
// next function, which reads table[c-0x80] and is presumably
// DRW_ConvTable::toUtf8. The surviving tokens are kept in their original order;
// restore the gap from the upstream file before building.
std::string DRW_ConvTable::fromUtf8(const std::string &s) {
    std::string result;
    bool notFound;
    int code;

    int j = 0;
    for (unsigned int i=0; i < s.length(); i++) {
        unsigned char c = s.at(i);
        if (c > 0x7F) { //need to decode
            result += s.substr(j,i-j);
            std::string part1 = s.substr(i,4);
            int l;
            code = decodeNum(part1, &l);
            j = i+l;
            i = j - 1;
            notFound = true;
            for (int k=0; k 6 && *(it+1) == 'U' && *(it+2) == '+') {
                    res += encodeText(std::string(it, it+7));
                    it +=6;
                } else {
                    res +=c; //no \U+ encoded text write
                }
            } else
                res +=c; //c!='\' ascii char write
        } else {//end c < 0x80
            res += encodeNum(table[c-0x80]); //translate from table
        }
    } //end for

    return res;
}

/**
 * Turns a 7-character escape "\U+XXXX" into the UTF-8 encoding of code point
 * XXXX. Characters 3..6 of 'stmp' are parsed as a hexadecimal number; when
 * nothing can be parsed the code is 0, for which encodeNum() returns an empty
 * string.
 *
 * @param stmp escape text; must hold at least 3 characters, otherwise
 *             std::string::substr throws std::out_of_range.
 * @return     UTF-8 bytes for the parsed code point.
 */
std::string DRW_Converter::encodeText(const std::string &stmp){
    int code = 0;
#if defined(__APPLE__)
    // %x stores through an unsigned int*, so parse into an unsigned temporary
    unsigned int parsed = 0;
    const int succeeded = sscanf (stmp.substr(3,4).c_str(), "%x", &parsed );
    if ( succeeded == 1 )
        code = static_cast<int>(parsed);
#else
    std::istringstream sd(stmp.substr(3,4));
    sd >> std::hex >> code;
#endif
    return encodeNum(code);
}

/**
 * Formats code point 'c' as the escape "\U+XXXX": upper-case hexadecimal,
 * zero padded to at least 4 digits.
 *
 * @param c code point.
 * @return  the escape text.
 */
std::string DRW_Converter::decodeText(int c){
    std::string res = "\\U+";
    std::string num;
#if defined(__APPLE__)
    // Format into a C buffer and copy only up to the terminator: assigning a
    // 16-byte NUL-padded std::string would append the padding bytes too.
    char buf[16];
    snprintf (buf, sizeof(buf), "%04X", static_cast<unsigned int>(c) );
    num = buf;
#else
    std::stringstream ss;
    ss << std::uppercase << std::setfill('0') << std::setw(4) << std::hex << c;
    ss >> num;
#endif
    res += num;
    return res;
}

/**
 * Encodes code point 'c' as UTF-8.
 *
 * Values below 128 (including negative ones) yield one byte, below 0x800 two,
 * below 0x10000 three and everything else four bytes; the value is not range
 * checked. The bytes are assembled as a NUL-terminated C string, so c == 0
 * yields an empty string.
 *
 * @param c code point.
 * @return  the UTF-8 byte sequence.
 */
std::string DRW_Converter::encodeNum(int c){
    unsigned char ret[5];
    if (c < 128) { // 0-7F US-ASCII 7 bits
        ret[0] = c;
        ret[1] = 0;
    } else if (c < 0x800) { //80-07FF 2 bytes
        ret[0] = 0xC0 | (c >> 6);
        ret[1] = 0x80 | (c & 0x3f);
        ret[2] = 0;
    } else if (c< 0x10000) { //800-FFFF 3 bytes
        ret[0] = 0xe0 | (c >> 12);
        ret[1] = 0x80 | ((c >> 6) & 0x3f);
        ret[2] = 0x80 | (c & 0x3f);
        ret[3] = 0;
    } else { //10000-10FFFF 4 bytes
        ret[0] = 0xf0 | (c >> 18);
        ret[1] = 0x80 | ((c >> 12) & 0x3f);
        ret[2] = 0x80 | ((c >> 6) & 0x3f);
        ret[3] = 0x80 | (c & 0x3f);
        ret[4] = 0;
    }
    // The cast's target type was lost in this copy of the file; a char pointer
    // is what the std::string constructor needs here.
    return std::string(reinterpret_cast<char*>(ret));
}

/**
 * Decodes the UTF-8 sequence that starts at s[0].
 *
 * @param s text starting at the lead byte; it must contain the whole sequence
 *          announced by that byte (the callers pass up to 4 bytes).
 * @param b receives the number of bytes consumed: 2, 3 or 4 for a valid lead
 *          byte, 1 when s[0] is not the lead byte of a multi-byte sequence.
 *          It is always written, because the callers advance by this value.
 * @return  the decoded code point, or 0 when s[0] is not a multi-byte lead.
 * @throws std::out_of_range (from std::string::at) when 's' is empty or
 *         shorter than the announced sequence.
 *
 * Continuation bytes are not validated.
 */
int DRW_Converter::decodeNum(const std::string &s, int *b){
    int code= 0;
    *b = 1; // default for an invalid lead byte, so the caller always advances
    unsigned char c = s.at(0);
    if ( (c& 0xE0) == 0xC0) { //2 bytes
        code = ( c&0x1F)<<6;
        code = (s.at(1) &0x3F) | code;
        *b = 2;
    } else if ( (c& 0xF0) == 0xE0) { //3 bytes
        code = ( c&0x0F)<<12;
        code = ((s.at(1) &0x3F)<<6) | code;
        code = (s.at(2) &0x3F) | code;
        *b = 3;
    } else if ( (c& 0xF8) == 0xF0) { //4 bytes
        code = ( c&0x07)<<18;
        code = ((s.at(1) &0x3F)<<12) | code;
        code = ((s.at(2) &0x3F)<<6) | code;
        code = (s.at(3) &0x3F) | code;
        *b = 4;
    }

    return code;
}

// NOTE(review): damaged in this copy of the file. The text between "k" in the
// inner loop header and "> 8;" is missing (the loop condition, the table
// lookup and the start of the two-byte output). The surviving tokens are kept
// in order; restore the gap from the upstream file before building.
std::string DRW_ConvDBCSTable::fromUtf8(const std::string &s) {
    std::string result;
    bool notFound;
    int code;

    int j = 0;
    for (unsigned int i=0; i < s.length(); i++) {
        unsigned char c = s.at(i);
        if (c > 0x7F) { //need to decode
            result += s.substr(j,i-j);
            std::string part1 = s.substr(i,4);
            int l;
            code = decodeNum(part1, &l);
            j = i+l;
            i = j - 1;
            notFound = true;
            for (int k=0; k> 8;
                    d[1] = data & 0xFF;
                    d[2]= '\0';
                    result += d; //translate from table
                    notFound = false;
                    break;
                }
            }
            if (notFound)
                result += decodeText(code);
        } //direct conversion
    }
    result += s.substr(j);

    return result;
}

// NOTE(review): damaged in this copy of the file. The text between "k" in the
// "for (int k=sta; k" header and "0x7F) {" is missing; the lost span covers
// the rest of DRW_ConvDBCSTable::toUtf8 and the head of the next function,
// which uses CPOFFSET932 and is presumably DRW_Conv932Table::fromUtf8. A second
// gap follows at "for (int k=0; k> 8;". The surviving tokens are kept in order.
// NOTE(review): the target type of static_cast was also lost; unsigned char is
// assumed because the trail byte is OR-ed below the shifted lead byte. Confirm.
std::string DRW_ConvDBCSTable::toUtf8(const std::string &s) {
    std::string res;
    for (auto it=s.begin() ; it < s.end(); ++it ) {
        bool notFound = true;
        unsigned char c = *it;
        if (c < 0x80) {
            notFound = false;
            //check for \U+ encoded text
            if (c == '\\') {
                if (s.end()-it > 6 && *(it+1) == 'U' && *(it+2) == '+') {
                    res += encodeText(std::string(it, it+7));
                    it +=6;
                } else {
                    res +=c; //no \U+ encoded text write
                }
            } else
                res +=c; //c!='\' ascii char write
        } else if(c == 0x80 ){//1 byte table
            notFound = false;
            res += encodeNum(0x20AC);//euro sign
        } else {//2 bytes
            ++it;
            int code = (c << 8) | static_cast<unsigned char>(*it);
            int sta = leadTable[c-0x81];
            int end = leadTable[c-0x80];
            for (int k=sta; k 0x7F) { //need to decode
            result += s.substr(j,i-j);
            std::string part1 = s.substr(i,4);
            int l;
            code = decodeNum(part1, &l);
            j = i+l;
            i = j - 1;
            notFound = true;
            // 1 byte table
            if (code > 0xff60 && code < 0xFFA0) {
                result += code - CPOFFSET932; //translate from table
                notFound = false;
            }
            if (notFound && ( code<0xF8 || (code>0x390 && code<0x542) ||
                    (code>0x200F && code<0x9FA1) || code>0xF928 )) {
                for (int k=0; k> 8;
                        d[1] = data & 0xFF;
                        d[2]= '\0';
                        result += d; //translate from table
                        notFound = false;
                        break;
                    }
                }
            }
            if (notFound)
                result += decodeText(code);
        } //direct conversion
    }
    result += s.substr(j);

    return result;
}

// NOTE(review): the visible text of this function stops inside the header of
// its inner loop; the remainder is outside this view and is left untouched.
// NOTE(review): the target type of static_cast was lost in this copy of the
// file; unsigned char is assumed, as the trail byte is OR-ed below the shifted
// lead byte. Confirm against the upstream file.
std::string DRW_Conv932Table::toUtf8(const std::string &s) {
    std::string res;
    for (auto it=s.begin() ; it < s.end(); ++it ) {
        bool notFound = true;
        unsigned char c = *it;
        if (c < 0x80) {
            notFound = false;
            //check for \U+ encoded text
            if (c == '\\') {
                if (s.end()-it > 6 && *(it+1) == 'U' && *(it+2) == '+') {
                    res += encodeText(std::string(it, it+7));
                    it +=6;
                } else {
                    res +=c; //no \U+ encoded text write
                }
            } else
                res +=c; //c!='\' ascii char write
        } else if(c > 0xA0 && c < 0xE0 ){//1 byte table
            notFound = false;
            res += encodeNum(c + CPOFFSET932); //translate from table
        } else {//2 bytes
            ++it;
            int code = (c << 8) | static_cast<unsigned char>(*it);
            int sta=0;
            int end=0;
            if (c > 0x80 && c < 0xA0) {
                sta = DRW_LeadTable932[c-0x81];
                end = DRW_LeadTable932[c-0x80];
            } else if (c > 0xDF && c < 0xFD){
                sta = DRW_LeadTable932[c-0xC1];
                end = DRW_LeadTable932[c-0xC0];
            }
            if (end > 0) {
                for (int k=sta; k