爬虫软件开发要用到的代码

发布时间:2025-12-10 11:42:38 浏览次数:17

1.时间戳转为C#格式时间

/// <summary>
/// Converts a Unix timestamp expressed in milliseconds into a local <see cref="DateTime"/>.
/// </summary>
/// <param name="timeStamp">
/// Milliseconds elapsed since 1970-01-01T00:00:00Z, as a decimal string.
/// A null, empty or whitespace-only value is treated as 0 (the epoch itself).
/// </param>
/// <returns>The corresponding instant, expressed in the machine's local time zone.</returns>
/// <exception cref="FormatException"><paramref name="timeStamp"/> is not a valid integer.</exception>
/// <exception cref="OverflowException">The value does not fit in a tick count.</exception>
private DateTime GetTime(string timeStamp)
{
    // Parse culture-independently; a timestamp is machine data, not user-facing text.
    long milliseconds = string.IsNullOrWhiteSpace(timeStamp)
        ? 0L
        : long.Parse(
            timeStamp,
            System.Globalization.NumberStyles.Integer,
            System.Globalization.CultureInfo.InvariantCulture);

    // Do the arithmetic in UTC and convert to local time last, so the UTC offset
    // (including daylight saving) is the one in force at the target instant,
    // not the one that applied on 1970-01-01.
    DateTime epochUtc = new DateTime(1970, 1, 1, 0, 0, 0, DateTimeKind.Utc);
    long ticks = checked(milliseconds * TimeSpan.TicksPerMillisecond);
    return epochUtc.AddTicks(ticks).ToLocalTime();
}

2.获取验证码

/// <summary>
/// Downloads a captcha image and stores the cookie returned with it in <c>cookieCheckCode</c>.
/// </summary>
/// <param name="url">
/// Image URL. When null, empty or whitespace, a default captcha URL with a
/// tick-based cache-busting suffix is used.
/// </param>
/// <returns>The decoded image.</returns>
public Image GetImg(string url)
{
    if (string.IsNullOrWhiteSpace(url))
    {
        url = string.Format("http://ms.baihe.com/checkcode/defaultImageService?0.{0}", DateTime.Now.Ticks);
    }

    cookieCheckCode = "";
    var item = new HttpItem()
    {
        URL = url,
        Method = "get",
        Expect100Continue = false, // original note: use this when going through a proxy
        Cookie = _cookie,
        Timeout = 100000,
        ReadWriteTimeout = 30000,
        UserAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:28.0) Gecko/20100101 Firefox/28.0",
        ContentType = "text/html",
        ResultType = ResultType.Byte
    };

    var result = http.GetHtml(item);
    if (result.Cookie != null)
    {
        // When the cookie string starts with PHPSESSID, keep only what follows the first comma.
        // Ordinal comparison: the cookie name is a protocol token, not linguistic text.
        cookieCheckCode += result.Cookie.StartsWith("PHPSESSID", StringComparison.Ordinal)
            ? result.Cookie.Substring(result.Cookie.IndexOf(',') + 1)
            : result.Cookie;
    }

    return byteArrayToImage(result.ResultByte);
}

/// <summary>
/// Decodes an in-memory byte array into an <see cref="Image"/>.
/// </summary>
/// <param name="Bytes">Encoded image data.</param>
/// <returns>The decoded image; the caller owns it and should dispose it when done.</returns>
private Image byteArrayToImage(byte[] Bytes)
{
    // GDI+ requires the source stream to stay open for the lifetime of the Image,
    // so the stream must NOT be wrapped in a using block. A MemoryStream holds no
    // unmanaged resources, so leaving it to the garbage collector is safe.
    var ms = new MemoryStream(Bytes);
    return Image.FromStream(ms, true);
}

3.正则表达式的使用

// Extract (uid, nickname) pairs from anchors shaped like: ...&uid=123" class="yellow">nick</a>
// The nickname group is lazy (.+?) so that it stops at the first closing </a>;
// a greedy .+ would swallow every anchor up to the last </a> on the same line.
Regex re = new Regex(@"&uid=(\d+)\"" class=\""yellow\"">(.+?)</a>", RegexOptions.None);
MatchCollection mc = re.Matches(resulthtml.Html);
var list = new List<User>();
foreach (Match match in mc)
{
    var uid = match.Groups[1].Value;   // digits following "&uid="
    var nick = match.Groups[2].Value;  // anchor text
    list.Add(new User { uid = uid, nickname = nick });
}

4.获取时间戳(与 JS 的 getTime() 一致,单位为毫秒)

/// <summary>
/// Returns the current time as whole milliseconds elapsed since 1970-01-01 (UTC).
/// </summary>
/// <returns>Millisecond count, truncated toward zero.</returns>
public long GetTimeLikeJS()
{
    DateTime epoch = new DateTime(1970, 1, 1);
    TimeSpan elapsed = DateTime.UtcNow.Subtract(epoch);
    return (long)elapsed.TotalMilliseconds;
}

5.另一线程操作主线程的控件

          // Update textBox1 through this.Invoke so the assignment is executed via the control, not directly by the calling thread.
          this.Invoke(new Action(() =>
          {
              this.textBox1.Text = "登录成功";
          }));

.NET 2.0 里的匿名委托写法:

// .NET 2.0-style anonymous method (no lambda syntax): cast to EventHandler and handed to
// this.Invoke, where it sets button.Text to i.ToString().
this.Invoke((EventHandler)delegate { button.Text = i.ToString(); });

另:BackgroundWorker 组件用来执行诸如数据库事务、文件下载等耗时的异步操作

6.Json对象转换

// Parse the payload as a JSON array and work with its first element.
JArray array = JArray.Parse(json);
JToken token = array[0];
var total = token["total"];

// Each child of "list" is deserialized into a UserInfo.
var users = token["list"].Children();
foreach (JToken entry in users)
{
    string entryJson = entry.ToString();
    UserInfo user = JsonConvert.DeserializeObject<UserInfo>(entryJson);
    int age = user.age;
}

7.赶集网登录

// Ganji login: request the login endpoint as an ajax call, then reuse the returned
// cookie to fetch the "my posts" page as a check that the login worked.
HttpHelper http = new HttpHelper();

// NOTE(review): GetTime() is called without arguments here; confirm that such an
// overload exists, or whether a millisecond-timestamp helper was intended.
// NOTE(review): the password travels in the query string of a GET request; confirm
// this is what the endpoint requires.
HttpItem item = new HttpItem()
{
    // Escape the user-supplied values so characters such as '&', '#', '+' or spaces
    // cannot truncate or corrupt the query string.
    URL = string.Format(
        "https://passport.ganji.com/login.php?callback=jQuery{0}&username={1}&password={2}",
        GetTime(),
        System.Uri.EscapeDataString(textBox2.Text),
        System.Uri.EscapeDataString(textBox3.Text)),
    Referer = "https://passport.ganji.com/login.php?next=/", // referring URL (optional)
};
item.Header.Add("x-requested-with", "XMLHttpRequest"); // the key line: the Ganji server only accepts ajax requests
HttpResult result = http.GetHtml(item);

// Guard against a missing cookie instead of failing with a NullReferenceException.
string cookie = (result.Cookie ?? string.Empty).Replace("path=/;", "").Replace(",", "%2c");

// Login done: request the "my posted items" page as a test.
item = new HttpItem()
{
    URL = "http://www.ganji.com/vip/my_post_list.php",
    Method = "get",   // optional, defaults to Get
    Cookie = cookie,  // cookie string (optional)
};
result = http.GetHtml(item);
string html = result.Html;
textBox1.Text = html + "\r\n";

 

8.C# Unicode编码/解码

http://www.cnblogs.com/Rolends/archive/2011/09/22/2185276.html

// Use this form when a JSON POST arrives garbled or the server cannot read it:
// the body is sent as raw UTF-8 bytes (PostdataByte + PostDataType.Byte) instead of
// as a Postdata string.
HttpItem item = new HttpItem()
{
    URL = url,                 // required, e.g. "http://159.142.15.196:8089/api/Users/Post_ErpUsers"
    Method = "post",           // optional, defaults to Get
    IsToLower = false,         // whether the returned HTML is lower-cased; optional (original note: lower-cased by default)
    Cookie = "",               // cookie string, optional
    Referer = "",              // referring URL, optional
    Timeout = 100000,          // connection timeout, optional (original note: default 100000)
    ReadWriteTimeout = 30000,  // timeout for writing the POST data, optional (original note: default 30000)
    UserAgent = "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)", // browser type/version/OS, optional
    ContentType = "application/json", // content type, optional (has a default)
    Allowautoredirect = false, // whether to follow 301 redirects, optional
    PostdataByte = Encoding.UTF8.GetBytes(json),
    PostDataType = PostDataType.Byte
};

 

9.C#解析Html组件

组件名称:HtmlAgilityPack

XPath 获取元素:"//*[@class=\"dc-intro\"]/ul/li",即查找 class="dc-intro" 的元素下 ul 中的所有 li

// Load the software detail page and pull its fields out with XPath.
HtmlWeb web = new HtmlWeb();
var doc = web.Load(string.Format("{0}/soft/{1}.html", baseUrl, id));
var node = doc.GetElementbyId("form2");

var titleNode = node.SelectSingleNode("//*[@class=\"rr-title dc-title clearfix\"]/h1");
string title = titleNode.InnerText;

// Run the intro-list query once and index into the result, rather than
// re-evaluating the same XPath for every field.
var introItems = node.SelectNodes("//*[@class=\"dc-intro\"]/ul/li");
string kfyy = introItems[0].LastChild.InnerText; // development language
string slsj = introItems[1].LastChild.InnerText; // date listed
string sqxy = introItems[2].LastChild.InnerText; // license
string czxt = introItems[3].LastChild.InnerText; // supported operating systems
string rjpjHtml = introItems[4].InnerHtml;       // software rating markup

// Rating level = number of occurrences of "xx01.png" in the rating markup.
// The dot is escaped so it matches a literal '.' rather than any character.
var level = Regex.Matches(rjpjHtml, @"xx01\.png").Count;

string content = node.SelectSingleNode("//*[@class=\"markdown-body entry-content\"]").InnerHtml;

// href values of the first two links in the dc-info section.
// NOTE(review): presumably the homepage and download links (rjsy / rjxz) — confirm.
var infoLinks = node.SelectNodes("//*[@class=\"dc-info\"]/a");
string rjsy = infoLinks[0].Attributes["href"].Value;
string rjxz = infoLinks[1].Attributes["href"].Value;

 

转载于:https://www.cnblogs.com/bqh10086/p/5022207.html

需要做网站?需要网络推广?欢迎咨询客户经理 13272073477