大致思路
使用的工具为HttpClient
主要分为两种:1、不需要登录就能访问的网页 2、需要登录才可以访问的网页
1、不需要登录的网页
①构建浏览器
②确定访问的uri
③获取到响应并解析
2、需要登录才可以访问的网页
①构建浏览器
②获取cookie(一般情况下只需要cookie,有的时候还需要其他的请求头,例如authorization
请求头)
第一种:静态获取:直接在浏览器中复制即可(有时候cookie没有用,换一个浏览器可能可以)
第二种:模拟程序获取:如果不知道登录的请求uri和参数,可以先在浏览器中输入错误的账号、密码,得到用于登录的请求uri以及需要提交的表单的参数;
然后在程序中向这个uri提交正确的表单,得到response,并得到Cookie
③按照浏览器中这个网址指定的请求方式,在程序中确定正式网页的请求uri和请求方式
④添加必要的几个请求头(例如Cookie、User-Agent等)
⑤获取到响应,并解析
完工!
简单的HttpClient使用模板
如果是采用模拟程序获取Cookie的话,通常是先用Post提交表单,获取到Cookie,然后再通过Get方式携带Cookie,爬取到想要的网页信息
Get方式请求
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
|
/**
 * Example of a GET request issued with HttpClient: the request carries a
 * Cookie and a User-Agent header, and the response body is printed and passed
 * to {@code WriteToFile.write}.
 */
public class HttpGetParamTest {

    /**
     * Sends the GET request and, when the status code is 200, prints the body
     * followed by the value returned from {@code WriteToFile.write}.
     *
     * @param args unused
     * @throws Exception if the request URI cannot be built or if
     *     {@code WriteToFile.write} fails with something other than an
     *     {@link IOException}
     */
    public static void main(String[] args) throws Exception {
        // Build the request before opening the client so that a URI error
        // cannot leave an open client behind.
        URIBuilder uriBuilder = new URIBuilder(
                "https://mooc1.chaoxing.com/exam/test/reVersionPaperMarkContentNew?courseId=222650590"
                        + "&classId=50617144&p=1&id=57211393&ut=s&cpi=95361626&newMooc=true&openc=cfd40dfe5645b94bd7f0740626b30614");
        HttpGet httpGet = new HttpGet(uriBuilder.build());
        httpGet.addHeader("Cookie", "两种方式获取到的Cookie");
        httpGet.addHeader("User-Agent",
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100"
                        + ".0.4896.127 Safari/537.36 Edg/100.0.1185.50");

        // try-with-resources closes the response and then the client, and it
        // never calls close() on a response that was not obtained (the old
        // finally block threw a NullPointerException when execute() failed).
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpGet)) {
            if (response.getStatusLine().getStatusCode() == 200) {
                String content = EntityUtils.toString(response.getEntity(), "utf-8");
                System.out.println(content);
                System.out.println(WriteToFile.write(content));
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
|
Post方式请求
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57
|
/**
 * Example of a POST request issued with HttpClient: a URL-encoded form is
 * submitted and the length of the response body is printed.
 */
public class HttpPostParamTest {

    /**
     * Submits the form and, when the status code is 200, prints the length of
     * the response body.
     *
     * @param args unused
     * @throws UnsupportedEncodingException if the form entity cannot be
     *     encoded with the requested charset
     */
    public static void main(String[] args) throws UnsupportedEncodingException {
        HttpPost httpPost = new HttpPost("https://mooc1.chaoxing.com/exam/test/reVersionPaperMarkContentNew");

        List<NameValuePair> params = new ArrayList<NameValuePair>();
        params.add(new BasicNameValuePair("courseId", "222650590"));
        params.add(new BasicNameValuePair("cpi", "95361626"));
        params.add(new BasicNameValuePair("newMooc", "true"));
        params.add(new BasicNameValuePair("openc", "cfd40dfe5645b94bd7f0740626b30614"));

        // Built outside the try block on purpose: UnsupportedEncodingException
        // is an IOException and must keep propagating to the caller instead of
        // being caught below. Doing it before the client is opened also means
        // a failure here cannot leak the client.
        UrlEncodedFormEntity formEntity = new UrlEncodedFormEntity(params, "utf-8");
        httpPost.setEntity(formEntity);

        // try-with-resources closes the response and then the client, and it
        // never calls close() on a response that was not obtained (the old
        // finally block threw a NullPointerException when execute() failed).
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpPost)) {
            if (response.getStatusLine().getStatusCode() == 200) {
                String content = EntityUtils.toString(response.getEntity(), "utf-8");
                System.out.println("========================响应数据长度:" + content.length());
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
|