异常xml修复
今天邻居小伙遇到个问题,上游给的xml里夹带了一些本不该出现的、未转义的<>符号,导致tinyxml解析出错。短时间内没有什么好的替代库或解决方案。
这个问题倒是可以逆向思考一下,与其非常艰难地抓出异常出现的<>符号,倒不如把字符串按照正常xml规则解析一遍,剩下不符合规则的不就是异常字符么,换掉就行。果然手写xml解析或手写json解析是每个程序员练手的必由之路。
在纸上画了一堆流程草图,设计基本数据结构,有点当初用Cpp刷题的感觉了,甚至还搞了C#和Cpp两个版本。这次花的时间有点长,主要是天天调框架调三方库,对底层数据结构都有点不熟练了。
不幸的是,这一堆东西,可能没什么用途,小伙表示还是要参考官方的解决方案才是正道,自己修的xml不知道还有多少幺蛾子,而且这一堆注释都没有的东西鬼看得懂啊。
我想也是,光顾着飙代码了,没怎么顾及工程问题。那这坨代码先留在博客吧,没帮上什么大忙,自己正经工作都延误了。不过有点理解了LeetCode侠的存在,打代码解决问题的时候确实有点爽。
#include <fstream>
#include <iostream>
#include <iterator>
#include <stack>
#include <string>
#include <vector>
// Describes one '<' or '>' character located in the document text, together
// with the bookkeeping the matching passes in main() need.
class MarkItem
{
public:
    // Creates a marker at offset 0 that is neither an opening bracket nor an
    // end marker, has an empty name and has not been paired yet.
    MarkItem()
        : IsComplete(false),
          Index(0),
          IsLeft(false),
          IsEnd(false),
          Attr("")
    {
    }

    // Set once the marker has been paired with a counterpart; markers that
    // are still false at the end are the ones overwritten in the text.
    bool IsComplete;
    // Offset of the bracket character inside the document text.
    int Index;
    // True for '<', false for '>'.
    bool IsLeft;
    // For '<': true when it is immediately followed by '/'.
    // For '>': true when it is immediately preceded by '/' or '?'.
    bool IsEnd;
    // Characters collected right after '<' (and an optional '/' or '?') up to
    // the first terminator, i.e. the tag name. Left empty for '>' markers.
    std::string Attr;
};
int main()
{
//xml的value中有错误的 < >
std::ifstream in("C:\\Users\\Admin\\Downloads\\temp31.xml", std::ios::in);
std::istreambuf_iterator<char> beg(in), end;
std::string content(beg, end);
in.close();
int totalCount = content.size();
int index = 0;
//记录所有的< <\ > />符号
std::vector<MarkItem> markItems;
while (index < totalCount)
{
//修改1
if (content.at(index) == 0x22)
{
index += 1;
while (index < totalCount)
{
char onechar = content.at(index);
if (onechar == '>' || onechar == '<')
{
content.at(index) = '_';
}
if (onechar == 0x22)
{
break;
}
index += 1;
}
}
if (content.at(index) == '<')
{
MarkItem markItem;
markItem.IsLeft = true;
markItem.Index = index;
index += 1;
if (index < totalCount && content.at(index) == '/')
{
markItem.IsEnd = true;
index += 1;
}
else if (index < totalCount && content.at(index) == '?')
{
markItem.IsEnd = false;
index += 1;
}
while (index < totalCount)
{
char onechar = content.at(index);
if (onechar == ' ' || onechar == '/' || onechar == '>' || onechar == '<')
{
break;
}
//修改2
if (onechar == '\n')
{
break;
}
markItem.Attr += onechar;
index += 1;
}
markItems.push_back(markItem);
}
else if (content.at(index) == '>')
{
MarkItem markItem;
markItem.IsLeft = false;
if (index - 1 > 0 && (content.at(index - 1) == '/' || content.at(index - 1) == '?'))
{
markItem.IsEnd = true;
}
markItem.Index = index;
markItems.push_back(markItem);
index += 1;
}
else
{
index++;
}
}
//所有的<xxx />符号尝试进行匹配
for (int i = markItems.size() - 1; i >= 0; i--)
{
if (markItems[i].IsLeft == false && markItems[i].IsEnd == true)
{
for (int k = i; k >= 0; k--)
{
if (markItems[k].IsLeft == true && markItems[k].IsEnd == false && markItems[k].IsComplete == false)
{
markItems[i].IsComplete = true;
markItems[k].IsComplete = true;
break;
}
}
}
}
//基于栈的xml属性匹配队列
std::stack<int> markstack;
std::stack<int> bakcupstack;
for (int i = 0; i < markItems.size(); i++)
{
if (markItems[i].IsLeft == true && markItems[i].IsEnd == false)
{
markstack.push(i);
}
if (markItems[i].IsLeft == true && markItems[i].IsEnd == true)
{
int count = markstack.size();
while (!markstack.empty())
{
int index = markstack.top();
if (markItems[index].IsLeft == true && markItems[index].IsEnd == false
&& markItems[index].IsComplete == false && markItems[index].Attr == markItems[i].Attr)
{
markItems[i].IsComplete = true;
if (i + 1 < markItems.size() && markItems[i + 1].IsLeft == false)
{
markItems[i + 1].IsComplete = true;
}
markItems[index].IsComplete = true;
if (index + 1 < markItems.size() && markItems[index + 1].IsLeft == false)
{
markItems[index + 1].IsComplete = true;
}
markstack.pop();
break;
}
else
{
bakcupstack.push(index);
markstack.pop();
}
if (markstack.empty())
{
int poptime = 0;
while (poptime < count)
{
int temp = bakcupstack.top();
markstack.push(temp);
bakcupstack.pop();
poptime++;
}
bakcupstack.push(i);
}
}
}
}
//替换未匹配的 < > 为 _
for (int i = markItems.size() - 1; i >= 0; i--)
{
if (markItems[i].IsComplete == false)
{
content.at(markItems[i].Index) = '_';
}
}
//保存新文件
std::ofstream ofs;
ofs.open("new.xml", std::ios::out);
ofs << content << std::endl;
ofs.close();
}