孙建/异常xml修复

Created Wed, 09 Aug 2023 18:15:12 +0800 Modified Thu, 10 Aug 2023 15:29:30 +0800
1039 Words

异常xml修复

  今天邻居小伙遇到个问题,上游给的xml里带一些错误出现的未转义的<>符号,导致tinyxml解析出错。短时间没有什么好的替代库或解决方案。

  这个问题倒是可以逆向思考一下,与其非常艰难地抓出异常出现的<>符号,倒不如把字符串按照正常xml规则解析一遍,剩下不符合规则的不就是异常字符么,换掉就行。果然手写xml解析或手写json解析是每个程序员练手的必由之路。

  在纸上画了一堆流程草图,设计基本数据结构,有点当初用Cpp刷题的感觉了,甚至还搞了C#和Cpp两个版本。这次花的时间有点长,主要是天天调框架调三方库,对底层数据结构都有点不熟练了。

  不幸的是,这一堆东西,可能没什么用途,小伙表示还是要参考官方的解决方案才是正道,自己修的xml不知道还有多少幺蛾子,而且这一堆注释都没有的东西鬼看得懂啊。

  我想也是,光顾着飙代码了,没怎么顾及工程问题。那这坨代码先留在博客吧,没帮上什么大忙,自己正经工作都延误了。不过有点理解了LeetCode侠的存在,打代码解决问题的时候确实有点爽。


#include <cstddef>
#include <fstream>
#include <iostream>
#include <iterator>
#include <stack>
#include <string>
#include <vector>

// Record of a single angle bracket ('<' or '>') found while scanning the
// XML text. All flags start out false, the position is 0 and the name empty.
class MarkItem
{
public:
    MarkItem()
        : IsComplete(false),
          Index(0),
          IsLeft(false),
          IsEnd(false),
          Attr()
    {
    }

    // True once the bracket has been paired up as part of a well-formed tag;
    // brackets that stay false are overwritten with '_' in the output.
    bool IsComplete;
    // Offset of the bracket character inside the file content.
    int Index;
    // True for '<', false for '>'.
    bool IsLeft;
    // For '<': the bracket starts a closing tag ("</").
    // For '>': the bracket is preceded by '/' or '?' ("/>" or "?>").
    bool IsEnd;
    // Characters collected after a '<' up to the first delimiter, i.e. the
    // tag name; left empty for '>' brackets.
    std::string Attr;
};

int main()
{
    //xml的value中有错误的 < >
    std::ifstream in("C:\\Users\\Admin\\Downloads\\temp31.xml", std::ios::in);
    std::istreambuf_iterator<char> beg(in), end;
    std::string content(beg, end);
    in.close();

    int totalCount = content.size();
    int index = 0;

    //记录所有的<  <\  > />符号
    std::vector<MarkItem> markItems;
    while (index < totalCount)
    {   
        //修改1
        if (content.at(index) == 0x22) 
        {
            index += 1;
            while (index < totalCount)
            {
                char onechar = content.at(index);
                if (onechar == '>' || onechar == '<')
                {
                    content.at(index) = '_';
                }
                if (onechar == 0x22)
                {
                    break;
                }
                index += 1;
            }
        }

        if (content.at(index) == '<')
        {
            MarkItem markItem;
            markItem.IsLeft = true;
            markItem.Index = index;
            index += 1;

            if (index < totalCount && content.at(index) == '/')
            {
                markItem.IsEnd = true;
                index += 1;
            }
            else if (index < totalCount && content.at(index) == '?')
            {
                markItem.IsEnd = false;
                index += 1;
            }

            while (index < totalCount)
            {
                char onechar = content.at(index);
                if (onechar == ' ' || onechar == '/' || onechar == '>' || onechar == '<')
                {
                    break;
                }
                //修改2
                if (onechar == '\n')
                {
                    break;
                }
                markItem.Attr += onechar;
                index += 1;
            }
            markItems.push_back(markItem);
        }
        else if (content.at(index) == '>')
        {
            MarkItem markItem;
            markItem.IsLeft = false;
            if (index - 1 > 0 && (content.at(index - 1) == '/' || content.at(index - 1) == '?'))
            {
                markItem.IsEnd = true;
            }
            markItem.Index = index;
            markItems.push_back(markItem);
            index += 1;
        }
        else
        {
            index++;
        }
    }

    //所有的<xxx />符号尝试进行匹配
    for (int i = markItems.size() - 1; i >= 0; i--)
    {
        if (markItems[i].IsLeft == false && markItems[i].IsEnd == true)
        {
            for (int k = i; k >= 0; k--)
            {
                if (markItems[k].IsLeft == true && markItems[k].IsEnd == false && markItems[k].IsComplete == false)
                {
                    markItems[i].IsComplete = true;
                    markItems[k].IsComplete = true;
                    break;
                }
            }
        }
    }

    //基于栈的xml属性匹配队列
    std::stack<int> markstack;
    std::stack<int> bakcupstack;
    for (int i = 0; i < markItems.size(); i++)
    {
        if (markItems[i].IsLeft == true && markItems[i].IsEnd == false)
        {
            markstack.push(i);
        }

        if (markItems[i].IsLeft == true && markItems[i].IsEnd == true)
        {
            int count = markstack.size();
            while (!markstack.empty())
            {
                int index = markstack.top();
                if (markItems[index].IsLeft == true && markItems[index].IsEnd == false
                    && markItems[index].IsComplete == false && markItems[index].Attr == markItems[i].Attr)
                {
                    markItems[i].IsComplete = true;
                    if (i + 1 < markItems.size() && markItems[i + 1].IsLeft == false)
                    {
                        markItems[i + 1].IsComplete = true;
                    }
                    markItems[index].IsComplete = true;
                    if (index + 1 < markItems.size() && markItems[index + 1].IsLeft == false)
                    {
                        markItems[index + 1].IsComplete = true;
                    }
                    markstack.pop();
                    break;
                }
                else
                {
                    bakcupstack.push(index);
                    markstack.pop();
                }

                if (markstack.empty())
                {
                    int poptime = 0;
                    while (poptime < count)
                    {
                        int temp = bakcupstack.top();
                        markstack.push(temp);
                        bakcupstack.pop();
                        poptime++;
                    }
                    bakcupstack.push(i);
                }
            }
        }
    }

    //替换未匹配的 < > 为 _
    for (int i = markItems.size() - 1; i >= 0; i--)
    {
        if (markItems[i].IsComplete == false)
        {
            content.at(markItems[i].Index) = '_';
        }
    }

    //保存新文件
    std::ofstream ofs;
    ofs.open("new.xml", std::ios::out);
    ofs << content << std::endl;
    ofs.close();
}