php编码
大约 2 分钟
php编码
emoji使用utf-8进行解析
/**
 * Collect the continuation bytes that follow a UTF-8 lead byte.
 *
 * Every byte after the lead byte must match the bit pattern "10xxxxxx".
 *
 * @param string $temp  Byte string being parsed.
 * @param int    $i     Byte offset of the lead byte.
 * @param int    $level Total length of the sequence in bytes, lead byte included.
 * @return string|false Lowercase hex of the ($level - 1) continuation bytes, or false
 *                      when a byte is missing or is not a continuation byte. An empty
 *                      string is returned when $level is 1 or less.
 */
function checkSubUTF8($temp, $i, $level = 0)
{
    $endU = "";
    for ($j = 1; $j < $level; $j++)
    {
        $offset = $i + $j;
        // Truncated sequence: the string ends before all continuation bytes were read.
        if (!isset($temp[$offset]))
        {
            return false;
        }
        // Test the two high bits directly. Converting through base_convert() drops
        // leading zero bits, so an ASCII byte such as "A" (0x41 -> "1000001") used to
        // be mistaken for a "10xxxxxx" continuation byte.
        if ((ord($temp[$offset]) & 0xC0) !== 0x80)
        {
            return false;
        }
        $endU .= bin2hex($temp[$offset]);
    }
    return $endU;
}
/**
 * Decode the single UTF-8 character that starts at a given byte offset.
 *
 * @param string $temp Byte string being parsed.
 * @param int    $i    Byte offset at which the character is expected to start.
 * @return array Empty array when no valid character starts at $i; otherwise
 *               'data'  => the raw bytes of the character,
 *               'level' => its length in bytes (1 to 4),
 *               'print' => a debug line with the byte-count label, the character and its hex.
 */
function checkUTF8($temp, $i)
{
    // Nothing to decode past the end of the string.
    if (!isset($temp[$i]))
    {
        return [];
    }

    $lead = ord($temp[$i]);
    $startU = bin2hex($temp[$i]);

    // Single byte, pattern "0xxxxxxx". Checking the high bit covers the whole ASCII
    // range; measuring the length of the base-2 string only matched 0x40-0x7F, so
    // spaces, digits and punctuation were dropped or misread as lead bytes.
    if (($lead & 0x80) === 0x00)
    {
        $result = $temp[$i];
        $result_r = '单字节=>' . $result . "($startU)<br>";
        return ['data' => $result, 'level' => 1, 'print' => $result_r];
    }

    // Lead-byte forms as [mask, expected bits, sequence length, label]:
    //   11110xxx -> 4 bytes, 1110xxxx -> 3 bytes, 110xxxxx -> 2 bytes.
    $forms = [
        [0xF8, 0xF0, 4, '四字节=>'],
        [0xF0, 0xE0, 3, '三字节=>'],
        [0xE0, 0xC0, 2, '二字节=>'],
    ];

    foreach ($forms as $form)
    {
        list($mask, $bits, $level, $label) = $form;
        if (($lead & $mask) !== $bits)
        {
            continue;
        }

        // The forms are mutually exclusive, so a failed continuation check is final.
        $endU = checkSubUTF8($temp, $i, $level);
        if (!$endU)
        {
            return [];
        }

        $strUTF8 = $startU . $endU;
        // pack("H*") turns the hex string back into raw bytes.
        $result = pack("H*", $strUTF8);
        $result_r = $label . $result . "($strUTF8)<br>";
        return ['data' => $result, 'level' => $level, 'print' => $result_r];
    }

    // A stray continuation byte (10xxxxxx) or an invalid lead byte (11111xxx).
    return [];
}
// Demo input: emoji (4 bytes each), a 3-byte symbol, CJK characters and ASCII.
$temp = '👨👩👦👦👣❎衿华客Moments';
$length = strlen($temp);
$i = 0;
// Walk the string byte by byte, jumping over every character that decodes.
while ($i < $length)
{
    $result = checkUTF8($temp, $i);
    if (!$result)
    {
        // No valid character starts here; move on to the next byte.
        $i++;
        continue;
    }
    echo $result['data'], $result['print'];
    // Advance past all bytes of the character just printed.
    $i += $result['level'];
}
Mac下字符基本操作
安装python字符编码检测插件chardet.
pip install chardet
进入python终端操作
import io

# io.open takes an explicit encoding on both Python 2 and Python 3, so the text is
# written as UTF-8 without the Python-2-only str.decode() round trip.
with io.open("utf-8.txt", "w", encoding="utf-8") as f:
    f.write(u"中国")

import chardet

# Read the raw bytes back; chardet guesses the encoding from bytes, not from text.
with open("utf-8.txt", "rb") as f:
    result = f.read()

chardet.detect(result)
# {'confidence': 0.7525, 'language': '', 'encoding': 'utf-8'}
初步识别文档编码格式
file --mime x.txt
x.txt: text/plain; charset=unknown-8bit
file x.txt
x.txt: Non-ISO extended-ASCII text, with CRLF line terminators
使用bash进行编码转换
iconv -f ASCII -t UTF-8 x.txt
关于Non-ISO extended-ASCII text的解析
- most likely a “text” file from the lack of control characters (byte values 0–31) other than line breaks;
- “extended-ASCII” because there are characters outside the ASCII range (byte values ≥128);
- “non-ISO” because there are characters in the 128–159 range (ISO 8859 reserves this range for control characters).