[Java] 用java实现的电影天堂,飘花电影网的电影的下载地址抓取
1. 之前看了论坛上有坛友用 Python 写的抓取电影下载链接的帖子,于是心血来潮的我也打算用 Java 来写一个!其实并不是很难,下面附上代码。这是对电影天堂电影的抓取方法(在此期间尝试过设置代理以及使用线程池,但貌似均没有成功)。说明一下,主要用到的 jar 包有 httpclient 4.5 以及 jsoup 1.7。
·
1.之前看了一些论坛上有一个坛友用python写的抓取电影下载链接的,于是心血来潮的我也打算用java来写一个!其实并不是很难,下面附上代码
这是对电影天堂电影的抓取方法(在此期间尝试过设置代理以及使用线程池,但貌似均没有成功)。说明一下,主要用到的 jar 包有 httpclient 4.5 以及 jsoup 1.7。
1.
2.下面的飘花电影网的,其实可以看到爬取的过程是大同小异的,只是选择器有所差别而已
最后附上成功的截图
最后一张是在网页上的应用
这是对电影天堂电影的抓取方法(在此期间尝试过设置代理以及使用线程池,但貌似均没有成功)。说明一下,主要用到的 jar 包有 httpclient 4.5 以及 jsoup 1.7。
1.
[Java] 纯文本查看 复制代码
01
02
03
04
05
06
07
08
09
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
|
package downloade;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.sun.corba.se.spi.orbutil.threadpool.ThreadPool;
import Pojo.DyUrl;
import dao.JDBCUtils;
public class Dyttdownload {
static int id= 1 ;
public static HttpClient client= null ;
public static void main(String[] args) {
//ExecutorService fixedThreadPool = Executors.newFixedThreadPool(10);
Map<Integer,String> map= new HashMap<>();
for ( int i= 1 ;i< 50 ;i++){
// http://www.ygdy8.net/html/gndy/dyzz/list_23_2.html
map.put(i, "http://www.ygdy8.net/html/gndy/dyzz/list_23_" +i+ ".html" );
}
for (String string : map.values()) {
getUrl(string);
// Thread.currentThread().sleep(2000);
}
// getDownloadUrl("http://www.ygdy8.net/html/gndy/dyzz/20170926/55094.html");
}
public static void getUrl(String uri){
JDBCUtils utils= new JDBCUtils();
try {
client=HttpClients.createDefault();
// RequestConfig config=RequestConfig.custom().setProxy(new HttpHost("110.73.14.161",8123)).build();
HttpGet get= new HttpGet(uri);
// get.setConfig(config);
HttpResponse response=client.execute(get);
String result =EntityUtils.toString(response.getEntity(), "gb2312" );
Document doc=Jsoup.parse(result);
//css选择器
Elements elements= doc.select( "table.tbspan " );
for (Element element : elements) {
element.setBaseUri( "http://www.ygdy8.net" );
DyUrl dy=getDownloadUrl(element.select( "tr" ).get( 1 ).select( "a" ).text(),element.select( "tr" ).get( 1 ).select( "a" ).attr( "abs:href" ));
dy.setId(id);
utils.insert(dy);
id++;
}
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
public static DyUrl getDownloadUrl(String name,String dyurl){
DyUrl dy= new DyUrl();
// RequestConfig config=RequestConfig.custom().setProxy(new HttpHost("110.73.14.161",8123)).build();
try {
client=HttpClients.createDefault();
HttpGet get = new HttpGet(dyurl);
// get.setConfig(config);
HttpResponse response=client.execute(get);
String result =EntityUtils.toString(response.getEntity(), "gb2312" );
Document doc=Jsoup.parse(result);
Elements elements=doc.select( "div#Zoom table tr td a " );
dy.setDyname(name);
dy.setDyUrl(elements.get( 0 ).text());
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
return dy;
}
}
|
2.下面的飘花电影网的,其实可以看到爬取的过程是大同小异的,只是选择器有所差别而已
[Java] 纯文本查看 复制代码
01
02
03
04
05
06
07
08
09
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
|
package downloade;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import Pojo.DyUrl;
import dao.JDBCUtils;
public class piaohuadownload {
static int id= 1 ;
public static HttpClient client= null ;
public static void main(String[] args) {
Map<Integer,String> map= new HashMap<>();
for ( int i= 16 ;i< 50 ;i++){
map.put(i, "http://www.piaohua.com/html/dongzuo/list_" +i+ ".html" );
}
for (String string : map.values()) {
System.out.println( "正在爬这个" +string+ "网页" );
// TODO Auto-generated method stub
getUrl(string);
}
}
public static void getUrl(String uri){
JDBCUtils utils= new JDBCUtils();
try {
client =HttpClientBuilder.create().build();
HttpResponse response=client.execute( new HttpGet(uri));
String result =EntityUtils.toString(response.getEntity(), "utf-8" );
Document doc=Jsoup.parse(result);
doc.setBaseUri( "http://www.piaohua.com" );
Elements elements=doc.select( "#list dl" );
for (Element element : elements) {
String name=element.select( "font" ).first().text();
String dyurl=element.select( "a" ).first().absUrl( "href" );
DyUrl dy=getDownloadUrl(name, dyurl);
dy.setId(id);
utils.insert(dy);
id++;
}
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
public static DyUrl getDownloadUrl(String name,String dyurl){
DyUrl dUrl= new DyUrl();
try {
client=HttpClients.createDefault();
HttpResponse response;
response = client.execute( new HttpGet(dyurl));
String result =EntityUtils.toString(response.getEntity(), "utf-8" );
Document doc=Jsoup.parse(result);
Elements elements=doc.select( "#showinfo" ).select( "a" );
dUrl.setDyname(name);
dUrl.setDyUrl(elements.first().text());
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
return dUrl;
}
}
|
最后附上成功的截图
最后一张是在网页上的应用
更多推荐
所有评论(0)