// Interface used to standardize parsed elements.
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
using ZmjTool;
namespace ZmjConvert
{
/// <summary>
/// Contract that every parsed markup element exposes.
/// </summary>
/// <typeparam name="Element">Concrete element type used for parent and child references.</typeparam>
public interface IElement<Element> where Element : class
{
    /// <summary>
    /// Index in the source string at which the tag starts.
    /// </summary>
    int Start { get; }
    /// <summary>
    /// Total length of the tag's text.
    /// </summary>
    int Length { get; }
    /// <summary>
    /// Finds every direct child whose tag name equals <paramref name="tag"/>.
    /// </summary>
    /// <param name="tag">Child tag name, for example <c>div</c>.</param>
    /// <returns>The matching children.</returns>
    Element[] this[string tag] { get; }
    /// <summary>
    /// Parent element.
    /// </summary>
    Element Parent { get; set; }
    /// <summary>
    /// Name of the element, for example <c>a</c>.
    /// </summary>
    string TagName { get; }
    /// <summary>
    /// Text content of this node, excluding the content of its child nodes.
    /// </summary>
    string InnerText { get; set; }
    /// <summary>
    /// All child nodes.
    /// </summary>
    List<Element> Children { get; }
    /// <summary>
    /// All attributes of the node.
    /// </summary>
    Dictionary<string, string> Attributes { get; }
}
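/// <summary>
/// A class implementing the basic conversion methods; it supports only simple HTML parsing and simple XML parsing.
/// </summary>
/// <typeparam name="Element">The concrete element type, which must derive from this class and have a parameterless constructor.</typeparam>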
public class CHtmlElement<Element> : IElement<Element> where Element : CHtmlElement<Element>, new()
{
/// <summary>
/// Index in the source string at which the tag starts.
/// </summary>
public int Start { get; private set; }
/// <summary>
/// Total length of the tag's text.
/// </summary>
public int Length { get; private set; }
/// <summary>
/// Whether the matching closing tag was found for this tag during parsing.
/// </summary>
public bool IsClosed { get; set; } = false;
/// <summary>
/// Whether this instance represents a closing tag, i.e. the <c>&lt;/tag&gt;</c> form.
/// </summary>
private bool ACloseLabel { get; set; } = false;
/// <summary>
/// Internal marker set when a <c>&lt;...&gt;</c> construct turned out not to be a tag.
/// </summary>
private bool NotALabel { get; set; } = false;
/// <summary>
/// Returns the direct children whose tag name equals <paramref name="tag"/>.
/// </summary>
/// <param name="tag">Child tag name, for example <c>div</c>; <c>null</c> selects every child.</param>
/// <returns>A new array holding the selected children.</returns>
public Element[] this[string tag]
{
    get
    {
        // A null tag is a wildcard: every direct child is returned.
        if (tag == null)
        {
            return Children.ToArray();
        }
        return Children.FindAll(child => child.TagName == tag).ToArray();
    }
}
/// <summary>
/// Collects, depth first, every descendant of the given elements whose tag name matches <paramref name="tag"/>.
/// The elements of <paramref name="src"/> themselves are not tested, only their descendants.
/// </summary>
/// <param name="src">Root elements whose subtrees are searched.</param>
/// <param name="tag">Tag name to look for; <c>null</c> matches every descendant.</param>
/// <returns>The matching descendants.</returns>
public static IEnumerable<Element> GetAllElements(IEnumerable<Element> src, string tag)
{
    var found = new List<Element>();

    // Appends the matching direct children of each node, then descends into that node's children.
    void Collect(IEnumerable<Element> nodes)
    {
        foreach (var node in nodes)
        {
            found.AddRange(node[tag]);
            Collect(node.Children.ToArray());
        }
    }

    Collect(src);
    return found;
}
/// <summary>
/// Parent element.
/// </summary>
public Element Parent { get; set; }
/// <summary>
/// Name of the element, for example <c>a</c>.
/// </summary>
public string TagName { get; set; } = string.Empty;
/// <summary>
/// Text content of this node, excluding the content of its child nodes.
/// </summary>
public string InnerText { get; set; } = string.Empty;
/// <summary>
/// All child nodes.
/// </summary>
public List<Element> Children { get; private set; } = new List<Element>();
/// <summary>
/// All attributes of the node.
/// </summary>
public Dictionary<string, string> Attributes { get; private set; } = new Dictionary<string, string>();
/// <summary>
/// Static factory that builds a new element from its parts.
/// </summary>
/// <param name="par">Parent element, or <c>null</c>.</param>
/// <param name="name">Tag name.</param>
/// <param name="txt">Inner text.</param>
/// <param name="atts">Attributes; an empty dictionary is used when <c>null</c>.</param>
/// <param name="elements">Child elements; an empty list is used when <c>null</c>.</param>
/// <param name="isc">Whether the element is already closed.</param>
/// <returns>The newly created element.</returns>
public static Element CreateNew(Element par, string name, string txt, Dictionary<string, string> atts, Element[] elements, bool isc)
{
    return new Element()
    {
        Parent = par,
        TagName = name,
        InnerText = txt,
        Children = elements?.ToList() ?? new List<Element>(),
        Attributes = atts ?? new Dictionary<string, string>(),
        IsClosed = isc,
    };
}
/// <summary>
/// Looks for an encoding hint among the given elements.
/// The first element carrying a <c>charset</c> or <c>encoding</c> attribute decides; when it has both, <c>encoding</c> wins.
/// </summary>
/// <param name="element">Elements to inspect.</param>
/// <returns>The hinted encoding, or <c>null</c> when there is no hint or it cannot be resolved.</returns>
public static Encoding GetEncoding(IEnumerable<Element> element)
{
    try
    {
        foreach (var candidate in element)
        {
            var attrs = candidate.Attributes;
            var hasEncoding = attrs.TryGetValue("encoding", out var name);
            if (!hasEncoding && !attrs.TryGetValue("charset", out name))
            {
                continue;
            }
            return Encoding.GetEncoding(name);
        }
        return null;
    }
    catch (Exception)
    {
        // Best effort: any failure is reported as "no encoding hint".
        return null;
    }
}
/// <summary>
/// Renders the element as <c>&lt;tag&gt;inner text</c>, followed by <c>&lt;/tag&gt;</c> when the element is closed.
/// Attributes and children are not included.
/// </summary>
/// <returns>The textual form of the element.</returns>
public override string ToString()
{
    return $"<{TagName}>{InnerText}" + (IsClosed ? $"</{TagName}>" : "");
}
/// <summary>
/// Parses markup text into a list of elements. Whenever a closing tag is met, every element collected
/// after the nearest still-open element with the same name becomes a child of that element.
/// </summary>
/// <param name="htmlstr">The HTML/XML text to parse.</param>
/// <param name="idx">Index at which parsing starts.</param>
/// <returns>The elements left at the top level once parsing has finished.</returns>
public static IEnumerable<Element> Parse(string htmlstr, int idx = 0)
{
    // Builds an unnamed element holding the raw text between two positions.
    Element TextNode(int from, int to) => new Element
    {
        InnerText = htmlstr.Substring(from, to - from),
        Start = from,
        Length = to - from,
    };

    var pending = new List<Element>();
    while (true)
    {
        var node = GetNextElement(htmlstr, idx);
        if (node == null)
        {
            break;
        }
        var nodeEnd = node.Start + node.Length;

        if (node.NotALabel)
        {
            // A <...> construct that is not a tag: skip over it.
            idx = nodeEnd;
            continue;
        }

        if (!node.ACloseLabel)
        {
            pending.Add(node);
            idx = nodeEnd;
            continue;
        }

        // Closing tag: find the nearest still-open element with the same name.
        var owner = pending.LastOrDefault(x => x.TagName == node.TagName && !x.IsClosed);
        if (owner is null)
        {
            // No matching opener, so this closing tag is invalid and ignored.
            idx = nodeEnd;
            continue;
        }

        var ownerAt = pending.IndexOf(owner);
        var bodyStart = owner.Start + owner.Length;
        owner.Children.AddRange(pending.Skip(ownerAt + 1));
        pending.RemoveRange(ownerAt + 1, pending.Count - ownerAt - 1);

        var kids = owner.Children;
        if (kids.Count < 1)
        {
            owner.InnerText = htmlstr.Substring(bodyStart, node.Start - bodyStart);
        }
        else
        {
            // Insert unnamed text elements so the text of each region stays distinguishable.
            // A text element is only created when the gap is at least two characters long.
            var firstKid = kids.First();
            if (firstKid.Start - 1 > bodyStart)
            {
                kids.Insert(0, TextNode(bodyStart, firstKid.Start));
            }
            for (int i = 0; i < kids.Count - 1; i++)
            {
                var left = kids[i];
                var right = kids[i + 1];
                var leftEnd = left.Start + left.Length;
                if (right.Start - 1 > leftEnd)
                {
                    kids.Insert(i + 1, TextNode(leftEnd, right.Start));
                    i++; // step over the element that was just inserted
                }
            }
            var lastKid = kids.Last();
            var lastEnd = lastKid.Start + lastKid.Length;
            if (node.Start - 1 > lastEnd)
            {
                kids.Add(TextNode(lastEnd, node.Start));
            }
        }

        // The owner now spans from its opening tag to the end of the closing tag.
        owner.Length = nodeEnd - owner.Start;
        foreach (var kid in owner.Children)
        {
            kid.Parent = owner;
        }
        owner.IsClosed = true;
        idx = owner.Start + owner.Length;
    }
    return pending;
}
/// <summary>
/// Matches a complete CDATA section, <c>&lt;![CDATA[ ... ]]&gt;</c>, lazily up to the first <c>]]&gt;</c>.
/// RegexOptions.Singleline is not set, so the section must sit on one line.
/// </summary>
public static readonly Regex CdataHeadReg = new Regex(@"<\!\[CDATA\[.*?\]\]>", RegexOptions.Compiled | RegexOptions.IgnoreCase);
/// <summary>
/// Matches a complete XML processing instruction, <c>&lt;?name ... ?&gt;</c>, whose name starts with a letter.
/// </summary>
public static readonly Regex XmlCmdHeadReg = new Regex(@"<\?[A-Za-z].*?\?>", RegexOptions.Compiled | RegexOptions.IgnoreCase);
/// <summary>
/// Matches a complete comment, <c>&lt;!-- ... --&gt;</c>, lazily up to the first <c>--&gt;</c>.
/// </summary>
public static readonly Regex NoteHeadReg = new Regex(@"<\!--.*?-->", RegexOptions.Compiled | RegexOptions.IgnoreCase);
/// <summary>
/// Matches a tag head: <c>&lt;</c> followed by a letter, up to the first <c>&gt;</c>.
/// </summary>
public static readonly Regex HtmlHeadReg = new Regex(@"<[A-Za-z].*?>", RegexOptions.Compiled | RegexOptions.IgnoreCase);
/// <summary>
/// Matches any <c>&lt;...&gt;</c> construct whose first character after <c>&lt;</c> is <c>!</c>, <c>?</c>, <c>/</c> or a letter.
/// </summary>
public static readonly Regex HtmlElementReg = new Regex(@"<[\!\?\/a-zA-Z].*?>", RegexOptions.Compiled | RegexOptions.IgnoreCase);
/// <summary>
/// Matches a closing tag: <c>&lt;/</c> followed by a letter, up to the first <c>&gt;</c>.
/// </summary>
public static readonly Regex HtmlElementEndReg = new Regex(@"<\/[A-Za-z].*?>", RegexOptions.Compiled | RegexOptions.IgnoreCase);
/// <summary>
/// Finds the next <c>&lt;...&gt;</c> construct at or after <paramref name="idx"/> and turns it into an element.
/// </summary>
/// <param name="src">The text being parsed.</param>
/// <param name="idx">Index at which the search starts.</param>
/// <returns>
/// <c>null</c> when no further construct exists. Otherwise an element describing a comment, a processing
/// instruction, a CDATA section, a closing tag, a non-tag construct, a self-closed tag, a script/style
/// element together with its raw content, or an opening tag.
/// </returns>
public static Element GetNextElement(string src, int idx)
{
    const string cdataOpen = "<![CDATA[";
    const string cdataClose = "]]>";

    var hh = HtmlElementReg.Match(src, idx); // first look for any <...> construct
    if (hh is null || !hh.Success) return null;
    Match th;
    if (hh.Value.StartsWith("<!--", StringComparison.Ordinal))
    {
        // A comment may contain '>', so it is matched again in full. The new match must begin at the
        // construct just found; otherwise a later comment would be returned and the text between skipped.
        th = NoteHeadReg.Match(src, hh.Index);
        if (th.Success && th.Index == hh.Index) return new Element
        {
            InnerText = th.Value.Substring(4, th.Value.Length - 7),
            Length = th.Length,
            Start = th.Index,
            TagName = "!--",
            IsClosed = true,
        };
    }
    if (hh.Value.StartsWith("<?", StringComparison.Ordinal))
    {
        // XML processing instruction of the form <?...?>; matched again in full at the same position.
        th = XmlCmdHeadReg.Match(src, hh.Index);
        if (th.Success && th.Index == hh.Index)
        {
            var xtag = GetTagNameFromHead(th.Value.Trim('<').Trim('>').Trim('?'), out var xatts);
            return new Element
            {
                Attributes = GetAttsFromStr(xatts),
                Length = th.Length,
                Start = th.Index,
                TagName = xtag,
                IsClosed = true,
            };
        }
    }
    if (hh.Value.StartsWith(cdataOpen, StringComparison.Ordinal))
    {
        // CDATA section <![CDATA[...]]>; matched again in full at the same position.
        // An unterminated section falls through and is handled as an ordinary construct below.
        th = CdataHeadReg.Match(src, hh.Index);
        if (th.Success && th.Index == hh.Index) return new Element
        {
            // The content lies between the 9-character opener and the 3-character closer.
            InnerText = th.Value.Substring(cdataOpen.Length, th.Length - cdataOpen.Length - cdataClose.Length),
            Length = th.Length,
            Start = th.Index,
            TagName = "![CDATA[",
            IsClosed = true,
        };
    }
    th = HtmlElementEndReg.Match(hh.Value);
    if (th != null && th.Success) return new Element
    {
        // Closing tag. The name is lower-cased the same way GetTagNameFromHead lower-cases opening
        // names, so that </DIV> pairs with <DIV>.
        ACloseLabel = true,
        TagName = hh.Value.Trim('<').Trim('>').Trim('/').Trim().ToLower(),
        Length = hh.Length,
        Start = hh.Index,
    };
    th = HtmlHeadReg.Match(hh.Value);
    if (th is null || !th.Success) return new Element
    {
        // Not a regular tag.
        NotALabel = true,
        Length = hh.Length,
        Start = hh.Index,
    };
    var head = hh.Value.Trim('<').Trim('>');
    var tag = GetTagNameFromHead(head, out var atts);
    if (head.EndsWith("/")) return new Element // self-closed form such as <br />
    {
        Attributes = GetAttsFromStr(atts),
        Length = hh.Length,
        Start = hh.Index,
        TagName = tag,
        IsClosed = true,
    };
    var tidx = hh.Index + hh.Length;
    if (tag.Contains("script") || tag.Contains("style"))
    {
        // Raw-text element: everything up to the end tag is taken verbatim. The tag name was lower-cased,
        // so the end tag is searched for case-insensitively to also find </SCRIPT>.
        var ed = src.IndexOf($"</{tag}>", tidx, StringComparison.OrdinalIgnoreCase);
        return new Element
        {
            Attributes = GetAttsFromStr(atts),
            Length = ed < 0 ? hh.Length : ed - hh.Index + tag.Length + 3,
            Start = hh.Index,
            TagName = tag,
            InnerText = ed < 0 ? string.Empty : src.Substring(tidx, ed - tidx),
            IsClosed = true,
        };
    }
    return new Element // opening tag such as <a ...>; closed later when </a> is found
    {
        Length = hh.Length,
        Start = hh.Index,
        TagName = tag,
        Attributes = GetAttsFromStr(atts),
        IsClosed = false,
    };
}
/// <summary>
/// Reads the next tag marker, separating its tag name, its attribute text and the text that precedes it.
/// </summary>
/// <param name="src">The original source string.</param>
/// <param name="idx">Where reading starts; advanced past what was consumed.</param>
/// <param name="ptxt">Text collected for the tag (text before it, or the content of a comment, instruction, CDATA or script).</param>
/// <param name="tag">The tag name that was found.</param>
/// <param name="atts">The raw attribute text of the tag.</param>
/// <param name="isclose">Whether the tag is already closed.</param>
/// <returns><c>true</c> when a tag marker was found; <c>false</c> otherwise, including for an unterminated script/style element.</returns>
[Obsolete("请使用同名的另一个方法")]
public static bool GetNextLabel(string src, ref int idx, out string ptxt, out string tag, out string atts, out bool isclose)
{
    string head = GetHeadFromSrc(src, ref idx, out ptxt, out int st);
    if (head == null) // must not be tested with string.IsNullOrWhiteSpace
    {
        tag = string.Empty;
        atts = string.Empty;
        isclose = true;
        return false; // no <...> left
    }
    tag = GetTagNameFromHead(head, out atts);
    if (head.StartsWith("?"))
    {
        // XML processing instruction, which must end with ?>.
        idx = st + 2;
        ptxt = GetInnerText(src, ref idx, "?>");
        tag = "?";
        isclose = true;
        return true;
    }
    if (head.StartsWith("!--"))
    {
        // Comment of the form <!-- ... -->.
        idx = st + 4;
        ptxt = GetInnerText(src, ref idx, "-->");
        tag = "!--";
        atts = string.Empty;
        isclose = true;
        return true;
    }
    if (head.StartsWith("![CDATA["))
    {
        // CDATA section: its content is prepended to the text of the following tag.
        idx = st + 9;
        ptxt = GetInnerText(src, ref idx, "]]>");
        var res = GetNextLabel(src, ref idx, out string pt, out tag, out atts, out isclose);
        ptxt += pt;
        return res;
    }
    if (string.IsNullOrWhiteSpace(tag) || !IsAXmlWord(tag))
    {
        // Invalid tag: ignore it, restarting just after its '<'.
        idx = st + 1;
        var res = GetNextLabel(src, ref idx, out string pt, out tag, out atts, out isclose);
        ptxt += pt;
        return res;
    }
    if (tag == "script" || tag == "style")
    {
        // Script or style: skip every <...> until the matching end tag.
        int sst = idx - 1;
        int bodyStart = st + head.Length + 2; // first character after the opening tag's '>'
        do
        {
            idx = sst + 1; // restart just after the previous '<'
            head = GetHeadFromSrc(src, ref idx, out _, out sst);
        }
        // Stop when the input is exhausted as well, instead of dereferencing a null head.
        while (head != null && !head.ToLower().StartsWith($"/{tag}"));
        isclose = true;
        if (head == null)
        {
            // Malformed page: the end tag is missing, so the rest of the input is the content.
            ptxt += src.Substring(bodyStart);
            idx = src.Length;
            return false;
        }
        ptxt += src.Substring(bodyStart, sst - bodyStart);
        idx = sst + head.Length + 2;
        return true;
    }
    isclose = head.EndsWith("/");
    return true; // <a ...></a> or <a ...>
}
/// <summary>
/// Tells whether a string qualifies as a tag word. The first character must be <c>?</c>, <c>!</c>, <c>/</c>
/// or an ASCII letter, and the word must contain neither a space nor <c>&gt;</c>. Length is not restricted.
/// </summary>
/// <param name="word">The candidate word.</param>
/// <returns><c>true</c> when the word qualifies.</returns>
public static bool IsAXmlWord(string word)
{
    var first = word.FirstOrDefault(); // '\0' for an empty word, which fails every test below
    if (word.Contains(' ') || word.Contains('>'))
    {
        return false;
    }
    switch (first)
    {
        case '?':
        case '!':
        case '/':
            return true;
    }
    var isUpper = first >= 'A' && first <= 'Z';
    var isLower = first >= 'a' && first <= 'z';
    return isUpper || isLower;
}
/// <summary>
/// 用先入后出的正则表达式方式接续字符串
/// </summary>
/// <param name="src"></param>
/// <param name="idx"></param>
/// <returns></returns>
//[Obsolete("请使用同名方法替换")]
//public static IEnumerable<Element> Parse(string src, int idx = 0)
//{
// var ls = new List<Element>();
// while (GetNextLabel(src, ref idx, out var ptxt, out var tag, out var atts, out var isclose))
// {
// if (tag.StartsWith("/"))
// {//出栈
// if (!TransAsChild(ls, tag.TrimStart('/').Trim(), ptxt))
// ls.Last().InnerText += ptxt;
// }//入栈
// else
// {//当script和style没有被正确关闭时,期间的所有内容都视为是脚本
// //var lst = ls.LastOrDefault();
// //if (ls.Count > 0 && !lst.IsClosed && (lst.TagName == "script" || lst.TagName == "style"))
// //{
// // lst.InnerText += ptxt;
// //}
// //else ls.Add(CreateNew(null, tag, ptxt.Trim(), GetAttsFromStr(atts), null, isclose));
// ls.Add(CreateNew(null, tag, ptxt.Trim(), GetAttsFromStr(atts), null, isclose));
// }
// }
// return ls;
//}
/// <summary>
/// Pop operation: every element that follows the last still-open element named <paramref name="tag"/>
/// becomes a child of that element and is removed from <paramref name="src"/>.
/// </summary>
/// <param name="src">The working list of elements.</param>
/// <param name="tag">Name of the element being closed.</param>
/// <param name="ptxt">Text appended, after trimming, to the closed element's inner text.</param>
/// <returns><c>false</c> when no open element with that name exists; otherwise <c>true</c>.</returns>
[Obsolete("使用新方法替代")]
public static bool TransAsChild(List<Element> src, string tag, string ptxt)
{
    var ownerAt = src.FindLastIndex(x => x.TagName == tag && !x.IsClosed);
    if (ownerAt < 0)
    {
        return false;
    }

    var owner = src[ownerAt];
    owner.InnerText += ptxt.Trim();
    owner.Children = src.GetRange(ownerAt + 1, src.Count - ownerAt - 1);
    owner.IsClosed = true; // mark as closed

    foreach (var child in owner.Children)
    {
        child.Parent = owner;
        src.Remove(child);
    }
    return true;
}
/// <summary>
/// Extracts the next tag head (the text between <c>&lt;</c> and the following <c>&gt;</c>) from the source.
/// </summary>
/// <param name="src">The source string.</param>
/// <param name="idx">Where the search starts; moved just past the <c>&gt;</c>, or to the end of the source when no head is found.</param>
/// <param name="txt">The text between the starting index and the <c>&lt;</c>, or the remaining text when no head is found.</param>
/// <param name="st">Index of the <c>&lt;</c>; -1 when there is none, or <paramref name="idx"/> when the index is out of range.</param>
/// <returns>The head without its angle brackets, or <c>null</c> when none is found.</returns>
[Obsolete("使用新方法替代")]
public static string GetHeadFromSrc(string src, ref int idx, out string txt, out int st)
{
    if (idx < 0 || idx >= src.Length)
    {
        txt = string.Empty;
        st = idx;
        return null;
    }

    st = src.IndexOf('<', idx);
    var close = st < 0 ? -1 : src.IndexOf('>', st + 1);
    if (close < 0)
    {
        // No complete <...> remains: hand back everything that is left.
        txt = src.Substring(idx);
        idx = src.Length;
        return null;
    }

    txt = src.Substring(idx, st - idx);
    idx = close + 1;
    return src.Substring(st + 1, close - st - 1);
}
/// <summary>
/// Returns the text between <paramref name="idx"/> and the next occurrence of <paramref name="endtag"/>.
/// </summary>
/// <param name="src">The source string.</param>
/// <param name="idx">Where the text starts; moved just past the end marker when it is found, otherwise left unchanged.</param>
/// <param name="endtag">The end marker to look for.</param>
/// <returns>The enclosed text, or an empty string when the index is out of range or the marker is missing.</returns>
[Obsolete("使用新方法替代")]
public static string GetInnerText(string src, ref int idx, string endtag)
{
    if (idx < 0 || idx >= src.Length)
    {
        return string.Empty;
    }

    var hit = src.IndexOf(endtag, idx);
    if (hit < 0)
    {
        return string.Empty;
    }

    var inner = src.Substring(idx, hit - idx);
    idx = hit + endtag.Length;
    return inner;
}
/// <summary>
/// Splits a tag head into its lower-cased tag name and its raw attribute text.
/// </summary>
/// <param name="head">The head text without angle brackets; surrounding whitespace and trailing <c>/</c> are ignored.</param>
/// <param name="atts">The text after the first space, with trailing <c>?</c> and whitespace removed; empty when there is none.</param>
/// <returns>The tag name in lower case.</returns>
public static string GetTagNameFromHead(string head, out string atts)
{
    var cleaned = head.Trim().TrimEnd('/').Trim();
    var split = cleaned.IndexOf(' ');
    if (split <= 0)
    {
        // No space: the whole head is the tag name.
        atts = string.Empty;
        return cleaned.Trim().ToLower();
    }

    atts = cleaned.Substring(split + 1).TrimEnd('?').Trim();
    return cleaned.Substring(0, split).Trim().ToLower();
}
/// <summary>
/// Parses every attribute out of a raw attribute string.
/// </summary>
/// <param name="str">The attribute text of a tag.</param>
/// <returns>A dictionary from attribute name to value; when a name occurs more than once the last value wins.</returns>
public static Dictionary<string, string> GetAttsFromStr(string str)
{
    var result = new Dictionary<string, string>();
    var cursor = 0;
    for (var next = GetNextAtt(str, cursor); next.HasValue; next = GetNextAtt(str, cursor))
    {
        var info = next.Value;
        result[info.Name] = info.Value; // adds the key or overwrites the existing value
        cursor = info.Start + info.Length;
    }
    return result;
}
/// <summary>
/// Information about a single attribute found while parsing attribute text.
/// </summary>
public readonly struct CHtmlAttrMsg
{
    /// <summary>
    /// Matches one attribute. Group 1 is the name, group 2 a single-quoted value, group 3 a double-quoted
    /// value and group 4 an unquoted value. The look-behind makes group 4 match only directly after an
    /// equals sign, so the next bare attribute name is never taken as a value.
    /// </summary>
    public static readonly Regex AttrReg = new Regex("([\\w-_]+) *=* *('.*?')*(\".*?\")*((?<== *)[^\\s'\"]+)?", RegexOptions.Compiled | RegexOptions.IgnoreCase);
    /// <summary>
    /// The underlying match.
    /// </summary>
    private readonly Match label;
    /// <summary>
    /// Wraps a match produced by <see cref="AttrReg"/>.
    /// </summary>
    /// <param name="lb">The match describing the attribute.</param>
    public CHtmlAttrMsg(Match lb)
    {
        label = lb;
    }
    /// <summary>
    /// Index at which the attribute starts.
    /// </summary>
    public int Start => label.Index;
    /// <summary>
    /// Length of the whole attribute text.
    /// </summary>
    public int Length => label.Length;
    /// <summary>
    /// Name of the attribute.
    /// </summary>
    public string Name => label.Groups[1].Value;
    /// <summary>
    /// Value of the attribute with its quotes and surrounding whitespace removed; empty when there is no value.
    /// </summary>
    public string Value => label.Groups[2].Value.Trim('\'').Trim() + label.Groups[3].Value.Trim('"').Trim() + label.Groups[4].Value;
}
/// <summary>
/// Finds the next attribute in <paramref name="str"/> using the attribute regular expression.
/// </summary>
/// <param name="str">The attribute text.</param>
/// <param name="idx">Index at which the search starts.</param>
/// <returns>Information about the attribute, or <c>null</c> when none is left.</returns>
public static CHtmlAttrMsg? GetNextAtt(string str, int idx)
{
    var hit = CHtmlAttrMsg.AttrReg.Match(str, idx);
    return hit != null && hit.Success ? new CHtmlAttrMsg(hit) : (CHtmlAttrMsg?)null;
}
/// <summary>
/// Reads one attribute name, and its value if present, starting at the given position.
/// </summary>
/// <param name="str">The attribute text.</param>
/// <param name="idx">Where reading starts; advanced past what was consumed.</param>
/// <param name="val">The attribute value, or an empty string when there is none.</param>
/// <returns>The lower-cased attribute name, or an empty string when no name was read.</returns>
[Obsolete("请使用新的同名方法替代")]
public static string GetNextAtt(string str, ref int idx, out string val)
{
    if (idx < 0 || idx >= str.Length - 1)
    {
        idx = str.Length;
        val = string.Empty;
        return string.Empty;
    }
    // Find where the name ends: at '=', a space, or an opening quote (a name cannot start with a quote).
    var st = str.IndexOfAny("= '\"".ToArray(), idx);
    if (st < 0)
    {
        var v = str.Substring(idx);
        idx = str.Length;
        val = string.Empty;
        return v;
    }
    if (str[st] == '\'' || str[st] == '"')
    {
        // A quoted run with no name in front of it.
        var ed1 = str.IndexOf(str[st], st + 1);
        if (ed1 < 0)
        {
            // No matching closing quote: malformed input.
            idx = str.Length;
            val = string.Empty;
            return string.Empty;
        }
        idx = ed1 + 1; // step past the closing quote
        val = str.Substring(st + 1, ed1 - st - 1);
        return string.Empty;
    }
    // Read the attribute name.
    var vv = str.Substring(idx, st - idx).ToLower().Trim();
    if (string.IsNullOrWhiteSpace(vv))
    {
        // Empty name: move one character forward and try again.
        idx++;
        return GetNextAtt(str, ref idx, out val);
    }
    var st1 = str.IndexOf('=', st); // look for '='
    if (st1 < 0)
    {
        // No '=' means there is no value.
        idx = st + 1;
        val = string.Empty;
        return vv;
    }
    st = StringTool.IndexOfChar(str, c => c != ' ', st1 + 1); // skip spaces
    // NOTE(review): what IndexOfChar returns when nothing matches is not visible here; both a negative
    // and a past-the-end result are treated as "no value" so the indexer below cannot throw.
    if (st < 0 || st >= str.Length)
    {
        idx = str.Length;
        val = string.Empty;
        return vv;
    }
    if (str[st] == '\'' || str[st] == '\"')
    {
        // A quoted value follows the '='.
        var ed1 = str.IndexOf(str[st], st + 1);
        if (ed1 < 0)
        {
            // No matching closing quote: malformed input.
            idx = st + 1;
            val = string.Empty;
            return vv;
        }
        idx = ed1 + 1; // step past the closing quote
        val = str.Substring(st + 1, ed1 - st - 1);
        return vv;
    }
    // Unquoted value: it runs to the next space, or to the end of the string.
    var ed = str.IndexOf(' ', st);
    if (ed < 0) ed = str.Length; // use the full remainder so the last character is kept
    idx = Math.Min(ed + 1, str.Length); // step past the space without leaving the string
    val = str.Substring(st, ed - st);
    return vv;
}
}
}