再探Delphi字符串

从其它平台迁移而来

闲来无事，又开始扒拉起Delphi的源码，这次发现一个比较有意思的函数StringCodePage，作用是返回传入字符串的CodePage。至于什么是CodePage，暂且认为是字符编码吧。

先测试一把：

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13


// Demo: assign the same literal to three differently typed strings and ask
// StringCodePage for the code page of each one.
var
  s1: AnsiString;
  s2: WideString;
  s3: UTF8String;
  cp1, cp2, cp3: Word;
begin
  s1 := '123abc中国';
  s2 := '123abc中国';
  s3 := '123abc中国';
  // The trailing comments record the values the author observed.
  // NOTE(review): 936 is presumably the ANSI code page of the author's machine;
  // another system locale may report a different value - confirm.
  cp1 := StringCodePage(s1); //936   - GBK (Simplified Chinese)
  // NOTE(review): the StringCodePage overload quoted in this article takes a
  // UnicodeString, so s2 is presumably converted before the call, which would
  // explain the 1200 - confirm against the other overloads.
  cp2 := StringCodePage(s2); //1200  - UCS-2LE Unicode, little-endian
  cp3 := StringCodePage(s3); //65001 - UTF-8 Unicode
end;

来看下是怎么实现的：

1
2
3
4
5
6
7


// Returns the code page recorded for S.
//   S      - the string to inspect.
//   Result - for a non-empty S, the Word stored 12 bytes before the first
//            character of S; for an empty S, DefaultUnicodeCodePage cast to Word.
function StringCodePage(const S: UnicodeString): Word; overload;
begin
  if S <> '' then
    // Step back 12 bytes from the character data and read a Word there.
    Result := PWord(PByte(S) - 12)^          // StrRec.codePage
  else
    // An empty string has no data to step back from, so use the default.
    Result := Word(DefaultUnicodeCodePage);
end;

原来字符串首地址逆向偏移12个字节所存放的Word型数据就是该字符串的CodePage信息。注释里出现了一个StrRec，看一下是何方神圣：

1
2
3
4
5
6


// Header record referred to by the "StrRec.codePage" comment in StringCodePage.
// Packed, so the fields occupy 2 + 2 + 4 + 4 = 12 bytes with no padding, which
// matches the 12-byte backward offset StringCodePage uses to reach codePage.
StrRec = packed record
  codePage: Word;  // code page
  elemSize: Word;  // element size
  refCnt: Integer; // reference count
  length: Integer; // string length
end;

两个Word两个Integer刚好12B，那就看下所有字节吧：

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31


// Renders len bytes starting at p as two-digit hex values separated by single
// spaces (for example '41 42'). Returns '' when len is zero or negative.
function BytesToHex(p: PByte; len: Integer): string;
var
  cursor: PByte;
  remaining: Integer;
begin
  Result := '';
  cursor := p;
  remaining := len;
  while remaining > 0 do
  begin
    // Put the separator between items only, so no trimming is needed afterwards.
    if Result <> '' then
      Result := Result + ' ';
    Result := Result + IntToHex(cursor^, 2);
    Inc(cursor);
    Dec(remaining);
  end;
end;

// Demo: dump the 12 bytes in front of each string's character data followed by
// the character data itself, for four string types holding the same literal.
var
  s1: AnsiString;
  s2: WideString;
  s3: UTF8String;
  s4: UnicodeString;
  hex1, hex2, hex3, hex4: string;
begin
  s1 := '123abc中国';
  s2 := '123abc中国';
  s3 := '123abc中国';
  s4 := '123abc中国';
  // Every dump starts 12 bytes before the first character and covers those
  // 12 bytes plus the payload. s2 and s4 multiply Length by 2 to get the
  // payload size in bytes (hex4 below shows an element size of 02 00).
  hex1 := BytesToHex(PByte(s1) - 12, Length(s1) + 12);
  // NOTE(review): _WStrLen only reads the 4 bytes directly in front of a
  // WideString's data, so the first 8 bytes of this dump are presumably not
  // part of any string header - confirm.
  hex2 := BytesToHex(PByte(s2) - 12, Length(s2) * 2 + 12);
  hex3 := BytesToHex(PByte(s3) - 12, Length(s3) + 12);
  hex4 := BytesToHex(PByte(s4) - 12, Length(s4) * 2 + 12);

// Output recorded by the author:
//hex1: A8 03 01 00 01 00 00 00 0A 00 00 00 31 32 33 61 62 63 D6 D0 B9 FA
//hex2: 86 06 D4 26 5E AC 00 18 10 00 00 00 31 00 32 00 33 00 61 00 62 00 63 00 2D 4E FD 56
//hex3: E9 FD 01 00 01 00 00 00 0C 00 00 00 31 32 33 61 62 63 E4 B8 AD E5 9B BD
//hex4: B0 04 02 00 01 00 00 00 08 00 00 00 31 00 32 00 33 00 61 00 62 00 63 00 2D 4E FD 56
end;

查一下各个字符的编码备用：123abc的ASCII码就不必说了；中的GBK编码D6D0，Unicode编码4E2D，UTF-8编码E4B8AD；国的GBK编码B9FA，Unicode编码56FD，UTF-8编码E59BBD。

先看s1，codePage：A8 03，elemSize：01 00，refCnt：01 00 00 00，length：0A 00 00 00，字符串内容：31 32 33 61 62 63 D6 D0 B9 FA。

字符串内容比较好理解，与具体编码也都一一对上了，但是其它的又是怎么回事呢？用PWord和PInteger去取的话又是没问题的啊！其实是字节序（大端、小端）的问题：字节序是由CPU架构决定的，而不是由编译器决定的。至于大小端的细节这里不讨论，知道这里用的是小端、多字节数值的低位字节存放在前面就可以了。

按小端来解析s1，codePage：03 A8=936，elemSize：00 01=1，refCnt：00 00 00 01=1，length：00 00 00 0A=10。

s2就比较头大了，前8个字节总是变来变去（从下面的代码可以看出，WideString只使用了数据前面的4个字节来存放长度，这8个字节并不属于字符串本身），length：00 00 00 10=16，这里的16是字节数而不是字符数，字符串内容按小端来解析也是没问题的。

Delphi的System单元源码中有这样一段代码：

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22


// Returns the length of the wide string S, or 0 when S is nil.
// The value is read from the Integer stored 4 bytes before the string data.
// On MSWINDOWS that value is shifted right by one bit (halved) before it is
// returned; on other platforms it is returned unchanged.
// NOTE(review): the halving suggests the Windows prefix holds a byte count
// rather than a character count - confirm.
function _WStrLen(const S: _WideStr): Integer; inline;
{$IFDEF CPU64BITS}
begin
  Result := 0;
  if Pointer(S) <> nil then
    {$IFDEF MSWINDOWS}
    Result := PInteger(PByte(S) - 4)^ shr 1;
    {$ELSE}
    Result := PInteger(PByte(S) - 4)^;
    {$ENDIF}
end;
{$ELSE !CPU64BITS}
begin
  // Non-64-bit variant: Result first holds the address of S, so the nil
  // check and the address arithmetic reuse the same variable.
  Result := IntPtr(S);
  if Result <> 0 then
    {$IFDEF MSWINDOWS}
    Result := PInteger(PByte(Result - 4))^ shr 1;
    {$ELSE}
    Result := PInteger(PByte(Result - 4))^;
    {$ENDIF}
end;
{$ENDIF !CPU64BITS}

意思是说求WideString的长度时，Windows平台下还需要右移一位，其实就是除以2：长度字段里存放的是字节数（比如s2的16），每个字符占2个字节，除以2之后才是字符数（8）。

s3，codePage：FD E9=65001，elemSize：00 01=1，refCnt：00 00 00 01=1，length：00 00 00 0C=12，字符串内容与UTF-8编码一致。

s4按小端来解析，codePage：04 B0=1200，elemSize：00 02=2，refCnt：00 00 00 01=1，length：00 00 00 08=8（这里是字符数），字符串内容与UCS-2LE编码一致。

这里发现一个重大问题：WideString和UnicodeString虽然在内容上一样，但具体实现却是不同的。

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51


// Demo: for each of four string types, print four addresses before and after
// the variable is assigned a literal:
//   @sN         - address of the variable itself
//   @sN[1]      - address of the first character
//   @PByte(sN)  - address of the variable viewed through a PByte typecast
//   PByte(sN)   - the pointer value held in the variable
// Addresses are cast to NativeUInt rather than Cardinal: Cardinal is always
// 32 bits wide, so it cannot hold a pointer on 64-bit targets. On 32-bit
// targets the printed values are the same as with Cardinal.
var
  s1: AnsiString;
  s2: WideString;
  s3: UTF8String;
  s4: UnicodeString;
begin
  // AnsiString: before and after assignment.
  Writeln('@s1        : ', NativeUInt(@s1));
  Writeln('@s1[1]     : ', NativeUInt(@s1[1]));
  Writeln('@PByte(s1) : ', NativeUInt(@PByte(s1)));
  Writeln('PByte(s1)  : ', NativeUInt(PByte(s1)));
  s1 := '123abc中国';
  Writeln('s1 : ', s1);
  Writeln('@s1        : ', NativeUInt(@s1));
  Writeln('@s1[1]     : ', NativeUInt(@s1[1]));
  Writeln('@PByte(s1) : ', NativeUInt(@PByte(s1)));
  Writeln('PByte(s1)  : ', NativeUInt(PByte(s1)));
  Writeln('------------------------');
  // WideString: before and after assignment.
  Writeln('@s2        : ', NativeUInt(@s2));
  Writeln('@s2[1]     : ', NativeUInt(@s2[1]));
  Writeln('@PByte(s2) : ', NativeUInt(@PByte(s2)));
  Writeln('PByte(s2)  : ', NativeUInt(PByte(s2)));
  s2 := '123abc中国';
  Writeln('s2 : ', s2);
  Writeln('@s2        : ', NativeUInt(@s2));
  Writeln('@s2[1]     : ', NativeUInt(@s2[1]));
  Writeln('@PByte(s2) : ', NativeUInt(@PByte(s2)));
  Writeln('PByte(s2)  : ', NativeUInt(PByte(s2)));
  Writeln('------------------------');
  // UTF8String: before and after assignment.
  Writeln('@s3        : ', NativeUInt(@s3));
  Writeln('@s3[1]     : ', NativeUInt(@s3[1]));
  Writeln('@PByte(s3) : ', NativeUInt(@PByte(s3)));
  Writeln('PByte(s3)  : ', NativeUInt(PByte(s3)));
  s3 := '123abc中国';
  Writeln('s3 : ', s3);
  Writeln('@s3        : ', NativeUInt(@s3));
  Writeln('@s3[1]     : ', NativeUInt(@s3[1]));
  Writeln('@PByte(s3) : ', NativeUInt(@PByte(s3)));
  Writeln('PByte(s3)  : ', NativeUInt(PByte(s3)));
  Writeln('------------------------');
  // UnicodeString: before and after assignment.
  Writeln('@s4        : ', NativeUInt(@s4));
  Writeln('@s4[1]     : ', NativeUInt(@s4[1]));
  Writeln('@PByte(s4) : ', NativeUInt(@PByte(s4)));
  Writeln('PByte(s4)  : ', NativeUInt(PByte(s4)));
  s4 := '123abc中国';
  Writeln('s4 : ', s4);
  Writeln('@s4        : ', NativeUInt(@s4));
  Writeln('@s4[1]     : ', NativeUInt(@s4[1]));
  Writeln('@PByte(s4) : ', NativeUInt(@PByte(s4)));
  Writeln('PByte(s4)  : ', NativeUInt(PByte(s4)));
  // Keep the console window open until Enter is pressed.
  Readln;
end.

结果如图，不多解释了。