Tommy 碎碎念

Tommy Wu's blog

« 上一篇 | 下一篇 »

讓 SciTE 自動偵測 UTF8 與 UTF16-LE 無 BOM 的檔案
post by tommy @ 16 元月, 2013 15:50

之前改了 SciTE 對於無 BOM 的 UTF16-LE 的自動偵測功能, 用了幾天, 除了修正一點小問題外, 也順便再加上對於無 BOM 的 UTF8 偵測.

Patch 如下:

diff --strip-trailing-cr -Nur a/scite/src/FileWorker.cxx c/scite/src/FileWorker.cxx
--- a/scite/src/FileWorker.cxx 2012-10-22 05:54:31.000000000 +0800
+++ c/scite/src/FileWorker.cxx 2013-01-16 15:15:40.428700300 +0800
@@ -57,7 +57,7 @@
}
 
FileLoader::FileLoader(WorkerListener *pListener_, ILoader *pLoader_, FilePath path_, long size_, FILE *fp_) :
- FileWorker(pListener_, path_, size_, fp_), pLoader(pLoader_), readSoFar(0), unicodeMode(uni8Bit) {
+ FileWorker(pListener_, path_, size_, fp_), pLoader(pLoader_), readSoFar(0), unicodeMode(uni8Bit), withBOM(false) {
jobSize = static_cast<int>(size);
jobProgress = 0;
}
@@ -91,6 +91,10 @@
fp = 0;
unicodeMode = static_cast<UniMode>(
static_cast<int>(convert.getEncoding()));
+ withBOM = convert.getWithBOM();
+ if (unicodeMode == uniUTF8 && !withBOM) {
+ unicodeMode = uniCookie;
+ }
// Check the first two lines for coding cookies
if (unicodeMode == uni8Bit) {
unicodeMode = umCodingCookie;
@@ -107,9 +111,9 @@
}
 
FileStorer::FileStorer(WorkerListener *pListener_, const char *documentBytes_, FilePath path_,
- long size_, FILE *fp_, UniMode unicodeMode_, bool visibleProgress_) :
+ long size_, FILE *fp_, UniMode unicodeMode_, bool withBOM_, bool visibleProgress_) :
FileWorker(pListener_, path_, size_, fp_), documentBytes(documentBytes_), writtenSoFar(0),
- unicodeMode(unicodeMode_), visibleProgress(visibleProgress_) {
+ unicodeMode(unicodeMode_), withBOM(withBOM_), visibleProgress(visibleProgress_) {
jobSize = static_cast<int>(size);
jobProgress = 0;
}
@@ -128,6 +132,7 @@
convert.setEncoding(static_cast<Utf8_16::encodingType>(
static_cast<int>(unicodeMode)));
}
+ convert.setWithBOM(withBOM);
convert.setfile(fp);
std::vector<char> data(blockSize + 1);
int lengthDoc = static_cast<int>(size);
diff --strip-trailing-cr -Nur a/scite/src/FileWorker.h c/scite/src/FileWorker.h
--- a/scite/src/FileWorker.h 2012-10-22 05:54:31.000000000 +0800
+++ c/scite/src/FileWorker.h 2013-01-16 15:16:42.824491700 +0800
@@ -36,6 +36,7 @@
ILoader *pLoader;
long readSoFar;
UniMode unicodeMode;
+ bool withBOM;
 
FileLoader(WorkerListener *pListener_, ILoader *pLoader_, FilePath path_, long size_, FILE *fp_);
virtual ~FileLoader();
@@ -51,10 +52,11 @@
const char *documentBytes;
long writtenSoFar;
UniMode unicodeMode;
+ bool withBOM;
bool visibleProgress;
 
FileStorer(WorkerListener *pListener_, const char *documentBytes_, FilePath path_,
- long size_, FILE *fp_, UniMode unicodeMode_, bool visibleProgress_);
+ long size_, FILE *fp_, UniMode unicodeMode_, bool withBOM_, bool visibleProgress_);
virtual ~FileStorer();
virtual void Execute();
virtual void Cancel();
diff --strip-trailing-cr -Nur a/scite/src/SciTEBase.h c/scite/src/SciTEBase.h
--- a/scite/src/SciTEBase.h 2013-01-16 15:07:51.545409500 +0800
+++ c/scite/src/SciTEBase.h 2013-01-16 15:17:56.512055100 +0800
@@ -97,6 +97,7 @@
bool useMonoFont;
enum { empty, reading, readAll, open } lifeState;
UniMode unicodeMode;
+ bool withBOM;
time_t fileModTime;
time_t fileModLastAsk;
time_t documentModTime;
@@ -109,7 +110,7 @@
enum FutureDo { fdNone=0, fdFinishSave=1 } futureDo;
Buffer() :
RecentFile(), doc(0), isDirty(false), isReadOnly(false), useMonoFont(true), lifeState(empty),
- unicodeMode(uni8Bit), fileModTime(0), fileModLastAsk(0), documentModTime(0),
+ unicodeMode(uni8Bit), withBOM(false), fileModTime(0), fileModLastAsk(0), documentModTime(0),
findMarks(fmNone), pFileWorker(0), futureDo(fdNone) {}
 
void Init() {
@@ -119,6 +120,7 @@
useMonoFont = true;
lifeState = empty;
unicodeMode = uni8Bit;
+ withBOM = false;
fileModTime = 0;
fileModLastAsk = 0;
documentModTime = 0;
diff --strip-trailing-cr -Nur a/scite/src/SciTEIO.cxx c/scite/src/SciTEIO.cxx
--- a/scite/src/SciTEIO.cxx 2012-10-22 05:54:31.000000000 +0800
+++ c/scite/src/SciTEIO.cxx 2013-01-16 15:21:09.199878000 +0800
@@ -320,6 +320,10 @@
 
CurrentBuffer()->unicodeMode = static_cast<UniMode>(
static_cast<int>(convert.getEncoding()));
+ CurrentBuffer()->withBOM = convert.getWithBOM();
+ if (CurrentBuffer()->unicodeMode == uniUTF8 && !CurrentBuffer()->withBOM) {
+ CurrentBuffer()->unicodeMode = uniCookie;
+ }
// Check the first two lines for coding cookies
if (CurrentBuffer()->unicodeMode == uni8Bit) {
CurrentBuffer()->unicodeMode = umCodingCookie;
@@ -335,6 +339,7 @@
// May not be found if load cancelled
if (iBuffer >= 0) {
buffers.buffers[iBuffer].unicodeMode = pFileLoader->unicodeMode;
+ buffers.buffers[iBuffer].withBOM = pFileLoader->withBOM;
buffers.buffers[iBuffer].lifeState = Buffer::readAll;
if (pFileLoader->err) {
GUI::gui_string msg = LocaliseMessage("Could not open file '^0'.", pFileLoader->path.AsInternal());
@@ -954,7 +959,7 @@
if (!(sf & sfSynchronous)) {
wEditor.Call(SCI_SETREADONLY, 1);
const char *documentBytes = reinterpret_cast<const char *>(wEditor.CallReturnPointer(SCI_GETCHARACTERPOINTER));
- CurrentBuffer()->pFileWorker = new FileStorer(this, documentBytes, filePath, lengthDoc, fp, CurrentBuffer()->unicodeMode, (sf & sfProgressVisible));
+ CurrentBuffer()->pFileWorker = new FileStorer(this, documentBytes, filePath, lengthDoc, fp, CurrentBuffer()->unicodeMode, CurrentBuffer()->withBOM, (sf & sfProgressVisible));
CurrentBuffer()->pFileWorker->sleepTime = props.GetInt("asynchronous.sleep");
if (PerformOnNewThread(CurrentBuffer()->pFileWorker)) {
retVal = true;
@@ -968,6 +973,7 @@
convert.setEncoding(static_cast<Utf8_16::encodingType>(
static_cast<int>(CurrentBuffer()->unicodeMode)));
}
+ convert.setWithBOM(CurrentBuffer()->withBOM);
convert.setfile(fp);
char data[blockSize + 1];
retVal = true;
@@ -1145,6 +1151,10 @@
}
CurrentBuffer()->unicodeMode = static_cast<UniMode>(
static_cast<int>(convert.getEncoding()));
+ CurrentBuffer()->withBOM = convert.getWithBOM();
+ if (CurrentBuffer()->unicodeMode == uniUTF8 && !CurrentBuffer()->withBOM) {
+ CurrentBuffer()->unicodeMode = uniCookie;
+ }
// Check the first two lines for coding cookies
if (CurrentBuffer()->unicodeMode == uni8Bit) {
CurrentBuffer()->unicodeMode = umCodingCookie;
diff --strip-trailing-cr -Nur a/scite/src/Utf8_16.cxx c/scite/src/Utf8_16.cxx
--- a/scite/src/Utf8_16.cxx 2012-10-22 05:54:31.000000000 +0800
+++ c/scite/src/Utf8_16.cxx 2013-01-16 15:27:25.508352900 +0800
@@ -12,6 +12,9 @@
#include "Utf8_16.h"
 
#include <stdio.h>
+#if defined(_WIN32) || defined(_WIN64)
+#include <windows.h>
+#endif
 
const Utf8_16::utf8 Utf8_16::k_Boms[][3] = {
{0x00, 0x00, 0x00}, // Unknown
@@ -30,6 +33,7 @@
 
Utf8_16_Read::Utf8_16_Read() {
m_eEncoding = eUnknown;
+ m_bWithBOM = false;
m_nBufSize = 0;
m_pBuf = NULL;
m_pNewBuf = NULL;
@@ -52,6 +56,7 @@
if (m_bFirstRead) {
nSkip = determineEncoding();
m_bFirstRead = false;
+ if (nSkip) m_bWithBOM = true;
}
 
if (m_eEncoding == eUnknown) {
@@ -89,6 +94,75 @@
return pCur - m_pNewBuf;
}
 
+// Returned value :
+// 0 : utf8
+// 1 : 7bits
+// 2 : 8bits
+int Utf8_16_Read::utf8_7bits_8bits()
+{
+ int rv = 1;
+ int ASCII7only = 1;
+ utf8 *sx = (utf8 *)m_pBuf;
+ utf8 *endx = sx + m_nLen;
+
+ while (sx<endx)
+ {
+ // For detection, we'll say that NUL means not UTF8
+ if (!*sx)
+ {
+ ASCII7only = 0;
+ rv = 0;
+ break;
+ }
+ // 0nnnnnnn If the byte's first hex code begins with 0-7, it is an ASCII character.
+ else if (*sx < 0x80)
+ {
+ sx++;
+ }
+ // 10nnnnnn 8 through B cannot be first hex codes
+ else if (*sx < (0x80 + 0x40))
+ {
+ ASCII7only=0;
+ rv=0;
+ break;
+ }
+ // 110xxxvv 10nnnnnn If it begins with C or D, it is an 11 bit character
+ else if (*sx < (0x80 + 0x40 + 0x20))
+ {
+ ASCII7only=0;
+ if (sx>=endx-1)
+ break;
+ if (!(*sx & 0x1F) || (sx[1]&(0x80+0x40)) != 0x80) {
+ rv=0; break;
+ }
+ sx+=2;
+ }
+ // 1110qqqq 10xxxxvv 10nnnnnn If it begins with E, it is 16 bit
+ else if (*sx < (0x80 + 0x40 + 0x20 + 0x10))
+ {
+ ASCII7only=0;
+ if (sx>=endx-2)
+ break;
+ if (!(*sx & 0xF) || (sx[1]&(0x80+0x40)) != 0x80 || (sx[2]&(0x80+0x40)) != 0x80) {
+ rv=0; break;
+ }
+ sx+=3;
+ }
+ // more than 16 bits are not allowed here
+ else
+ {
+ ASCII7only=0;
+ rv=0;
+ break;
+ }
+ }
+ if (ASCII7only)
+ return 1;
+ if (rv)
+ return 0;
+ return 2;
+}
+
int Utf8_16_Read::determineEncoding() {
m_eEncoding = eUnknown;
 
@@ -105,6 +179,15 @@
m_eEncoding = eUtf8;
nRet = 3;
}
+#if defined(_WIN32) || defined(_WIN64)
+ // try to detect UTF-16 little-endian without BOM
+ else if (m_pBuf[0] != 0 && m_pBuf[1] == 0 && IsTextUnicode(m_pBuf, m_nLen, NULL)) {
+ m_eEncoding = eUtf16LittleEndian;
+ }
+#endif
+ else if (utf8_7bits_8bits() == 0) {
+ m_eEncoding = eUtf8;
+ }
}
 
return nRet;
@@ -114,6 +197,7 @@
 
Utf8_16_Write::Utf8_16_Write() {
m_eEncoding = eUnknown;
+ m_bWithBOM = false;
m_pFile = NULL;
m_pBuf = NULL;
m_bFirstWrite = true;
@@ -147,6 +231,9 @@
return ::fwrite(p, _size, 1, m_pFile);
}
 
+ if (m_bWithBOM == false)
+ m_bFirstWrite = false;
+
if (m_eEncoding == eUtf8) {
if (m_bFirstWrite)
::fwrite(k_Boms[m_eEncoding], 3, 1, m_pFile);
@@ -210,6 +297,10 @@
m_eEncoding = eType;
}
 
+void Utf8_16_Write::setWithBOM(bool bWithBOM) {
+ m_bWithBOM = bWithBOM;
+}
+
//=================================================================
Utf8_Iter::Utf8_Iter() {
reset();
diff --strip-trailing-cr -Nur a/scite/src/Utf8_16.h c/scite/src/Utf8_16.h
--- a/scite/src/Utf8_16.h 2012-10-22 05:54:31.000000000 +0800
+++ c/scite/src/Utf8_16.h 2013-01-16 15:28:34.603249000 +0800
@@ -106,9 +106,12 @@
char* getNewBuf() { return reinterpret_cast<char*>(m_pNewBuf); }
 
encodingType getEncoding() const { return m_eEncoding; }
+ bool getWithBOM() const { return m_bWithBOM; }
protected:
int determineEncoding();
+ int utf8_7bits_8bits();
private:
+ bool m_bWithBOM;
encodingType m_eEncoding;
ubyte* m_pBuf;
ubyte* m_pNewBuf;
@@ -125,11 +128,13 @@
~Utf8_16_Write();
 
void setEncoding(encodingType eType);
+ void setWithBOM(bool bWithBOM);
 
void setfile(FILE *pFile);
size_t fwrite(const void* p, size_t _size);
void fclose();
protected:
+ bool m_bWithBOM;
encodingType m_eEncoding;
FILE* m_pFile;
utf16* m_pBuf;

Windows x64 的執行檔放這兒: http://www.teatime.com.tw/~tommy/files/scite323_64.7z
Windows x86 的執行檔放這兒: http://www.teatime.com.tw/~tommy/files/scite323_32.7z
PS. 要用的話, 後果請自行負責吧.... 

對於 UTF8 的檔案, 如果沒有 BOM, 會被自動歸到 UTF-8 那一個 Encoding 上頭 (SciTE 本來就可以存檔成無 BOM 的 UTF-8 檔案) .

對於 UTF16-LE 的檔案, 如果沒有 BOM, 會被歸到 UTF16-LE 那一個 Encoding 上頭, 存檔的時候也不會加上 BOM. (對 SciTE 來說, 那一個 Encoding 原本應該是有 BOM 的, 所以如果是產生新的檔案, 選擇這個編碼一定會有 BOM. 若要另外加一個無 BOM 的編碼選項, 會動到 UI 的部份, 太麻煩了, 所以就不打算改了, 有無 BOM 就共用同一個 Encoding 吧.)

目前的改法, 至少對於已存在的檔案, 不管原本有無 BOM, 都可以正常辨識, 而且存檔時也不會改變原本 BOM 的狀態.

對於 x64 的版本, 是用 gcc (mingw64) 編譯的 (因為我用 VS2012 編譯出來的檔案, 搭配 gcin 輸入法時, 會沒辦法正確的取得游標的位置). 而 x86 的版本, 因為沒有這個問題, 目前仍是使用 VS2012 編譯出來的.

Del.icio.us Furl HEMiDEMi Technorati MyShare
commons icon [1] Re:讓 SciTE 自動偵測 UTF8 與 UTF16-LE 無 BOM 的檔案 [ 回覆 ]

我是大陆用户wuha。真心感谢您的这篇文章帮助了我。一直纠结于SCITE的编码检测功能不能检测 无BOM的UTF-8,这里感谢您!感谢您的无私奉献

迴響
暱稱:
標題:
個人網頁:
電子郵件:
authimage

迴響

  

Bad Behavior 已經阻擋了 131 個過去 7 天試圖闖關的垃圾迴響與引用。
Power by LifeType. Template design by JamesHuang. Valid XHTML and CSS