彻底解决Html工具HtmlAgilityPack 中文乱码问题
作者:小鱼的互联网观察 发布时间:November 23, 2009 分类:互联网观察
对于HtmlAgilityPack,修改HtmlWeb.cs中的Get方法如下(原文以红色标出修改的代码,即下文中根据响应的charset确定respenc编码的部分):
/// <summary>
/// Sends an HTTP request for <paramref name="uri"/> and, depending on the cache settings,
/// either saves the response to the cache (optionally copying it to <paramref name="path"/>)
/// or loads it directly into <paramref name="doc"/>.
/// </summary>
/// <param name="uri">The resource to request.</param>
/// <param name="method">The HTTP method assigned to the request.</param>
/// <param name="path">Optional file that receives a copy of the cached file; may be null.</param>
/// <param name="doc">
/// Optional document that is loaded in-memory when the cache is not used and the
/// response content type is HTML; may be null.
/// </param>
/// <returns>
/// The response status code; <see cref="HttpStatusCode.NotModified"/> when the cached copy
/// is used; <see cref="HttpStatusCode.ResetContent"/> when the PreRequest handler returns false.
/// </returns>
/// <exception cref="HtmlWebException">
/// The uri does not produce an HTTP request, the cache-only mode finds no cached file,
/// or the server answers NotModified while the cache is disabled.
/// </exception>
private HttpStatusCode Get(Uri uri, string method, string path, HtmlDocument doc)
{
    string cachePath = null;
    bool oldFile = false;

    HttpWebRequest req = WebRequest.Create(uri) as HttpWebRequest;
    if (req == null)
    {
        // WebRequest.Create returns other request types for non-HTTP schemes;
        // fail with a clear message instead of a NullReferenceException below.
        throw new HtmlWebException("Uri does not resolve to an HTTP request: '" + uri + "'");
    }

    req.Method = method;
    _fromCache = false;
    _requestDuration = 0;
    int tc = Environment.TickCount;

    if (UsingCache)
    {
        cachePath = GetCachePath(req.RequestUri);
        if (File.Exists(cachePath))
        {
            req.IfModifiedSince = File.GetLastAccessTime(cachePath);
            oldFile = true;
        }
    }

    if (_cacheOnly)
    {
        if (!File.Exists(cachePath))
        {
            throw new HtmlWebException("File was not found at cache path: '" + cachePath + "'");
        }

        CopyCacheFileTo(cachePath, path);
        _fromCache = true;
        return HttpStatusCode.NotModified;
    }

    if (_useCookies)
    {
        req.CookieContainer = new CookieContainer();
    }

    if (PreRequest != null)
    {
        // allow our user to change the request at will
        if (!PreRequest(req))
        {
            return HttpStatusCode.ResetContent;
        }
    }

    HttpWebResponse resp;
    try
    {
        resp = req.GetResponse() as HttpWebResponse;
    }
    catch (WebException we)
    {
        _requestDuration = Environment.TickCount - tc;
        resp = (HttpWebResponse)we.Response;
        if (resp == null)
        {
            if (oldFile)
            {
                // no response at all, but a cached copy exists: serve it
                CopyCacheFileTo(cachePath, path);
                return HttpStatusCode.NotModified;
            }

            throw;
        }
    }
    catch (Exception)
    {
        _requestDuration = Environment.TickCount - tc;
        throw;
    }

    try
    {
        // allow our user to get some info from the response
        if (PostResponse != null)
        {
            PostResponse(req, resp);
        }

        _requestDuration = Environment.TickCount - tc;
        _responseUri = resp.ResponseUri;

        // Read the status before the response is closed; it is the value returned below.
        HttpStatusCode status = resp.StatusCode;

        bool html = IsHtmlContent(resp.ContentType);
        System.Text.Encoding respenc = ResolveResponseEncoding(resp);

        if (status == HttpStatusCode.NotModified)
        {
            if (!UsingCache)
            {
                // this should *never* happen...
                throw new HtmlWebException("Server has send a NotModifed code, without cache enabled.");
            }

            _fromCache = true;
            CopyCacheFileTo(cachePath, path);
            return status;
        }

        Stream s = resp.GetResponseStream();
        if (s != null)
        {
            if (UsingCache)
            {
                // NOTE: LastModified does not contain milliseconds, so we remove them to the file
                SaveStream(s, cachePath, RemoveMilliseconds(resp.LastModified), _streamBufferSize);

                // save headers
                SaveCacheHeaders(req.RequestUri, resp);

                CopyCacheFileTo(cachePath, path);
            }
            else if ((doc != null) && html)
            {
                // try to work in-memory
                if (respenc != null)
                {
                    doc.Load(s, respenc);
                }
                else
                {
                    // no usable encoding could be resolved: use the Load overload
                    // that takes a boolean instead of an explicit encoding
                    doc.Load(s, true);
                }
            }
        }

        return status;
    }
    finally
    {
        // Always release the connection, including on the NotModified paths and when
        // saving or parsing throws.
        if (resp != null)
        {
            resp.Close();
        }
    }
}

/// <summary>
/// Picks the text encoding used to decode the response body.
/// Order: a Content-Encoding value that happens to name a charset (kept for backward
/// compatibility), then the charset from Content-Type, then GB2312 as the default.
/// </summary>
/// <param name="resp">The response whose headers are inspected.</param>
/// <returns>The resolved encoding, or null when not even GB2312 is available.</returns>
private static System.Text.Encoding ResolveResponseEncoding(HttpWebResponse resp)
{
    // Content-Encoding normally carries a compression scheme ("gzip", "deflate"), which is
    // not a charset; such values are rejected by the lookup and simply skipped here.
    System.Text.Encoding enc = TryGetEncodingByName(resp.ContentEncoding);
    if (enc != null)
    {
        return enc;
    }

    // charset taken from the Content-Type header
    string charset = resp.CharacterSet;
    if (!string.IsNullOrEmpty(charset))
    {
        charset = charset.Trim().Trim('"', '\'');

        // NOTE(review): HttpWebResponse appears to report ISO-8859-1 when the server declared
        // no charset, so that value is treated as "unspecified" and mapped to GB2312.
        if (!string.Equals(charset, "ISO-8859-1", StringComparison.OrdinalIgnoreCase))
        {
            enc = TryGetEncodingByName(charset);
            if (enc != null)
            {
                return enc;
            }
        }
    }

    return TryGetEncodingByName("GB2312");
}

/// <summary>
/// Looks up an encoding by name without throwing.
/// </summary>
/// <param name="name">The encoding name; may be null or empty.</param>
/// <returns>The encoding, or null when the name is empty, unknown or unsupported.</returns>
private static System.Text.Encoding TryGetEncodingByName(string name)
{
    if (string.IsNullOrEmpty(name))
    {
        return null;
    }

    try
    {
        return System.Text.Encoding.GetEncoding(name);
    }
    catch (ArgumentException)
    {
        return null;
    }
    catch (NotSupportedException)
    {
        return null;
    }
}

/// <summary>
/// Copies the cached file to <paramref name="path"/> and gives the copy the cached file's
/// last-write time. Does nothing when <paramref name="path"/> is null.
/// </summary>
/// <param name="cachePath">The cached file to copy from.</param>
/// <param name="path">The destination file; may be null.</param>
private static void CopyCacheFileTo(string cachePath, string path)
{
    if (path == null)
    {
        return;
    }

    IOLibrary.CopyAlways(cachePath, path);
    // touch the file
    File.SetLastWriteTime(path, File.GetLastWriteTime(cachePath));
}
相关文章
- 无相关文章
- 版权声明:自由转载-非商用-非衍生-保持署名 | Creative Commons BY-NC-ND 3.0
- 原文网址:https://tianmeng.org/archives/173/
- 最后修改时间:2009年11月23日 15:11:05
当前暂无评论 »