From 304112ccf17f074a43b2e125f2e31c5a9ec9ec6c Mon Sep 17 00:00:00 2001 From: anna1795 Date: Wed, 29 Nov 2023 16:36:22 +0800 Subject: [PATCH] =?UTF-8?q?=E7=88=AC=E8=BD=A6=E7=B3=BB=E5=9B=BE=E7=89=87?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pom.xml | 7 + src/main/java/cj/reptile/App.java | 158 +++++++++++++++++- .../maven/cj/cj-reptile/pom.properties | 2 +- .../META-INF/maven/cj/cj-reptile/pom.xml | 7 + target/classes/cj/reptile/App.class | Bin 535 -> 4461 bytes 5 files changed, 167 insertions(+), 7 deletions(-) diff --git a/pom.xml b/pom.xml index d3a1d5b..dffb9c9 100644 --- a/pom.xml +++ b/pom.xml @@ -15,6 +15,13 @@ + + + org.jsoup + jsoup + 1.10.2 + + junit junit diff --git a/src/main/java/cj/reptile/App.java b/src/main/java/cj/reptile/App.java index f9469c1..7f85aa5 100644 --- a/src/main/java/cj/reptile/App.java +++ b/src/main/java/cj/reptile/App.java @@ -1,13 +1,159 @@ package cj.reptile; +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.net.URI; +import java.net.URISyntaxException; +import java.net.URL; +import java.net.URLConnection; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.jsoup.Connection; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; + /** * Hello world! * */ -public class App -{ - public static void main( String[] args ) - { - System.out.println( "Hello World!" ); - } +public class App { + + public static String pp(String input) { + // 定义正则表达式 + String regex = "\\d+"; + + // 创建Pattern对象 + Pattern pattern = Pattern.compile(regex); + + // 创建Matcher对象 + Matcher matcher = pattern.matcher(input); + + // 查找匹配的内容 + while (matcher.find()) { + String match = matcher.group(); + return match; + } + return ""; + } + + public static String urlName(String filename) { + filename = filename.replaceAll(":", ""); + filename = filename.replaceAll("\\*", ""); + filename = filename.replaceAll("\\?", ""); + filename = filename.replaceAll("<", ""); + filename = filename.replaceAll(">", ""); + filename = filename.replaceAll("|", ""); + return filename; + } + + public static void main(String[] args) throws IOException, URISyntaxException { + /* + * String folderPath = "D:\\db"; File folder = new File(folderPath); + * + * if (!folder.exists()) { if (folder.mkdirs()) { + * System.out.println("文件夹创建成功!"); } else { System.out.println("文件夹创建失败!"); } } + * else { System.out.println("文件夹已存在!"); } + */ + + /* + * if(src != null) { InputStream inputStream = getFileInputStream("https:"+src); + * ByteArrayOutputStream bytestream = new ByteArrayOutputStream(); int ch;while + * ((ch = inputStream.read()) != -1) { bytestream.write(ch); } String fileName = + * BaseInfoS.getFileName(bytestream.toByteArray(), + * src.substring(src.lastIndexOf("/") + 1)); // bytestream.toByteArray() + * inputStream.close(); bytestream.close(); + * System.out.println("fileName:"+fileName); brand.put("brand_logo", fileName); + * } + */ + + String letter = ""; + for (int a = 0; a < 26; a++) { + char chr = (char) ((int) 'A' + a); + letter = chr + ""; + Connection connect = Jsoup.connect("https://www.autohome.com.cn/grade/carhtml/" + letter + ".html"); + Document document = connect.get(); + String sp = ""; + Elements els = document.select("dl"); + for (int i = 0; i < els.size(); i++) { + Element e = els.get(i); + String pai = e.select("dt div a").text(); + String src = e.select("dt img").attr("src"); + + // Elements nodes = e.select("dd ul a"); + Elements nodes = e.select("dd h4 a"); + for (int j = 0; j < nodes.size(); j++) { + Element s = nodes.get(j); + // s.attr("href") + // href="//www.autohome.com.cn/5998/#levelsource=000000000_0&pvareaid=101594" + String f = s.text(); + + pai = urlName(pai); + f = urlName(f); + String path = "D:\\db\\" + pai + "\\" + f; + + + // path = path.replaceAll(":", ""); + + + File folder = new File(path); + + + // folder.mkdirs(); + // atk_5998 + + Connection href = Jsoup.connect("https:" + s.attr("href")); + Document documentt = href.get(); + Elements ee = documentt.select(".pic-main a"); + String imgsrc = ee.select("img").attr("src"); +// Element ee = e.getElementById("atk_"+pp(s.attr("href"))); +// path += "\\"; + if (imgsrc != null && imgsrc.length() > 0) { + String url="https:" +imgsrc; + + String savePath=path; + String filename=pai+f; + folder.mkdirs(); + download(url,savePath,filename.hashCode()+".jpg"); + + } + //System.out.println(imgsrc); + + + } + } + } + } + + public static void download(String urlString, String savePath, String filename) throws IOException { + System.out.println(urlString); + System.out.println(savePath+filename); + // 构造URL + URL url = new URL(urlString); + // 打开连接 + URLConnection con = url.openConnection(); + // 设置请求超时为20s + con.setConnectTimeout(20 * 1000); + // 文件路径不存在 则创建 + File sf = new File(savePath); + if (!sf.exists()) { + sf.mkdirs(); + } + InputStream in = con.getInputStream(); + OutputStream out = new FileOutputStream(sf.getPath() + "\\" + filename); + // 创建缓冲区 + byte[] buff = new byte[1024]; + int n; + // 开始读取 + while ((n = in.read(buff)) >= 0) { + out.write(buff, 0, n); + } + + } + } diff --git a/target/classes/META-INF/maven/cj/cj-reptile/pom.properties b/target/classes/META-INF/maven/cj/cj-reptile/pom.properties index 71bb959..fced898 100644 --- a/target/classes/META-INF/maven/cj/cj-reptile/pom.properties +++ b/target/classes/META-INF/maven/cj/cj-reptile/pom.properties @@ -1,5 +1,5 @@ #Generated by Maven Integration for Eclipse -#Wed Nov 29 11:52:31 CST 2023 +#Wed Nov 29 14:44:11 CST 2023 m2e.projectLocation=D\:\\code\\reptile m2e.projectName=cj-reptile groupId=cj diff --git a/target/classes/META-INF/maven/cj/cj-reptile/pom.xml b/target/classes/META-INF/maven/cj/cj-reptile/pom.xml index d3a1d5b..dffb9c9 100644 --- a/target/classes/META-INF/maven/cj/cj-reptile/pom.xml +++ b/target/classes/META-INF/maven/cj/cj-reptile/pom.xml @@ -15,6 +15,13 @@ + + + org.jsoup + jsoup + 1.10.2 + + junit junit diff --git a/target/classes/cj/reptile/App.class b/target/classes/cj/reptile/App.class index 15b3bb3b707427818a42fff3b693e35e441c9940..adf6b8c03220efa9d9e7f8278fd2ff6d41e0cdb7 100644 GIT binary patch literal 4461 zcmai2349c175{(PO=fnPTuT-hu7+0ABtVuH1iDZP1kk_|jsP_fDZ}oN-LO}8XA{7- zBE*&!DLrT_&{nBctS7Cy&{Au?Q0rBz;#u!%)q1qvT7&;@W|v(^s_@Gk-@N1h-uu7z zeaSOJPdo+S3Q?z^LZBwx7s!}tD;75ci_&QYZh^W!qu&U`jbv|N)z&^UYzb7&k0oQ) z0s&WZ%LV~=dn#gj;etm)C8`8!L$RdUnM-UnGwY14aZ|t-N`;O11|t)b?}bIT6^&)- zAatSEc>)#bv_MmHsN}lK%EXer^IC>4si-HAp2$=$M#Hb5L13({mtz==rq|pNSZ!FA znMrCGgRufiIF(2vLu@Y!<^yIjI+tm99bPX`*_%n_n4*T}5fgH`04Nko zrgN;8Z+Oo5(M}BGNIR(!&>gWdw1up(9>c#CBcUZb6SS1e#5;|Isgg|)K`&}z8fIgTfR{{*8)0)%JT5S|Jd1^5563-Xv{#WYJ*_GvZ+fm#aW&n~S8)w*3sfxP z?FIpL8w;K!JmxfE#F7GInm3K$fOPgQ-4T{OkV)3eAaAR@iw z;vyjlykAzgqrCpku$`x12Z>bDbwVlFSFwvVk64Y7SbwA8#Q;*$;f)$@l5B8W<_=56 zZsMm*EYYiC4-=Hlgr(G=hIgQrQk(!86}PY|kw{~7mSgBfFGt928Um&aa9RgahF_y8nyf}z8D($+1M^Qn2by>_ruX6%q2T1at~wj4gPTYxWZBS*3L;*#`Lmw zv4J{_7Z_hAb5S?26|&q*TxOikCzQ!uL{rXJ!Tu6Ns+UP!KuF5C(gvj}?PW^$@rQN= zyVlLUhE=WFmXbq|rwu1BoUD>SRP{s(9m;m8FzKg?^dMVll$#6V$r}A8ccrKY-@o@QdWbJLSC;WIESeC4hfg<)!b}>+Tc2rV|u$3{SQdClN~{R9r@5P4MG#Oh7Xxqt$je z%Q4E87V5d|^%lBR31}*&(NYc1w&O9K-zM;A#Bi zWq6Z}*)@$(={)J)?rRFUu0AgI)GeC0<*o&%vAlcIDTGd9bsk+I`Rd!im#KNY+1JAh zS5INHOyuJr;ewUEreKv`>FdiQ9#mYj)#IX7SNv)onWr%j^ynVn&OF|l#|=R*pWnu3 z-`n%p8yv;AoB0;hT(hf><3(NbR~M9O^qM^O1#5NhDcnl6T6wFrvFs-n-7CQy;6<-> z5C!W_<6yU5cM^97>utHaq+E^fP#*V|K>Ku`1A4SR+IOD>+@Lr34(D;yf#lcyXV9bj z^SJ*(tSi~!f{Z&jMz0HwmCF*?d!&b=|8aVK9`C(YAEP_p$K~<CCcB1OIfiMWKM`oTS<;}@^_e3n1j_= zh&8O~TCR5S|2m!**JC?2U?;kD5(lu?X4R_>TkuKbWfzG3I7xo7cf~%O z!sD_FL_eOuX(FB@R?_2>?1F`22|k6V$h9S6AwErwDy$T9@H9R{7Oobv@LB4rxK512 z=kR&*dOJ~lfpT6(GsI|G=zR|npP`04bG?EuQlpyjoyC`^QNt*|fiF|8mNF0H89vo9 z(!=-)zKVLrdlT*<79VZo@HJu?O}lY?o%R~&YcsW5;AajN;8{M6VJ3pqZ{aVy%h+48 zi^dTb4=;r-ax>~>vWJAhxa4>``WT+$sQDgmk{z5qk{r{V!gKvJp6}*B`-vPyKb5Uh z-`SYQFC}NEO3GI1m3jR32t4vm7P<5IeX!~@UhcNZ#>GE7CT-Fe?4HEmf=ay`PpU!n zII49;S6zNj9)H)Xbavl=x@UXtKlf6JPz#tK;AR591()DfjFOE>+Fs~2TBuLxWqr%0c$bU!eM9I;1)J{|dt!DA8aNA=rAu2_c zBmsXfQdo>i+FmA9;UNijlHO;9muPnLF9M@j2$kNvqMF6^;6DEC#CZ~;hSn=Y?SBD( CTW5Fx delta 203 zcmaE>G@XU()W2Q(7#J7~8I&e+^>VS~CT8X_GKgtJPiABk<_zOx5M&T$XAqjapF@;c zgh6!jbq+aM4hC^X1|E;poSb}x@cg2j6h$ruNd{?l1}R1c;mHp