特定のWebサイトにGETリクエストを送り、必要なデータを取得するプログラム。
だいぶ前に作成して使っていたが、少し変更したのでメモ。
前提として、Apache POI、jsoup、log4j を使用する。
import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.net.MalformedURLException; import org.apache.log4j.Logger; import org.apache.log4j.PropertyConfigurator; import org.apache.poi.hssf.usermodel.HSSFCell; import org.apache.poi.hssf.usermodel.HSSFRow; import org.apache.poi.hssf.usermodel.HSSFSheet; import org.apache.poi.hssf.usermodel.HSSFWorkbook; import org.apache.poi.ss.usermodel.Cell; import org.apache.poi.ss.usermodel.Row; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.select.Elements; public class EconomicIndicatorWebScraping { public static int CRAWLLING_START_URL_ROWNUM = 1;//0から開始。2行目からなので、2。 public static int CRAWLLING_END_URL_ROWNUM = 136; public static String INPUT_FILE_NAME = "C:\\opt\\work\\InPut\\crawllingEcoIndURL.xls"; public static String OUTPUT_FILE_NAME = "C:\\opt\\work\\OutPut\\scrapingEcoIndResult.xls"; static Logger logger = Logger.getLogger(EconomicIndicatorWebScraping.class.getName()); public static void main(String[] args) { try { //プロキシ設定 System.setProperty("proxySet", "true"); System.setProperty("proxyHost", "プロキシサーバのIP"); System.setProperty("proxyPort", "ポート番号"); //スクロール対象URLの読み込み InputStream fileIn = new FileInputStream(INPUT_FILE_NAME); HSSFWorkbook wbInput = new HSSFWorkbook(fileIn); HSSFSheet sheetInput = wbInput.getSheet("crawllingURL"); //結果書き込み用ワークブックを作成する HSSFWorkbook wbOutput = new HSSFWorkbook(); OutputStream fileOut = new FileOutputStream(OUTPUT_FILE_NAME); //入力ファイルからクローリング対象URLを読み込んで、各URLに対してスクレイピングする for (int crawllingNum = CRAWLLING_START_URL_ROWNUM; crawllingNum <= CRAWLLING_END_URL_ROWNUM; crawllingNum++) { Row rowInput = sheetInput.getRow(crawllingNum); Cell urlCell = rowInput.getCell(3); String url = urlCell.getStringCellValue(); if (!EconomicIndicatorWebScraping.checkURLFormat(url)) { String erroMsg = "URLデータがURL形式に沿っていません。" + "データ値:" + url + "行番号:" + crawllingNum; 
PropertyConfigurator.configure( "log4j.properties" ); logger.error(erroMsg); continue; } System.out.println(urlCell.getStringCellValue()); //結果書き込み用ワークシートの作成 //HSSFSheet sheetOutput = wbOutput.createSheet(titleCell.getStringCellValue()); HSSFSheet sheetOutput = wbOutput.createSheet(); Document doc = Jsoup.connect(url).get(); Elements table = doc.select("table"); Elements announcedDates = table.select("td:nth-child(8n+1)"); Elements announcedTimes = table.select("td:nth-child(8n+2)"); Elements countryNames = table.select("td:nth-child(8n+3)"); Elements indicatorNames = table.select("td:nth-child(8n+4)"); Elements ranks = table.select("td:nth-child(8n+5)"); Elements lastTimeResults = table.select("td:nth-child(8n+6)"); Elements expectations = table.select("td:nth-child(8n+7)"); Elements thisTimeResults = table.select("td:nth-child(8n+8)"); int announcedDateTotalNum = announcedDates.size(); for (int announcedNum = 0; announcedNum < announcedDateTotalNum; announcedNum++) { String announcedDate = announcedDates.get(announcedNum).text(); String announcedTime = announcedTimes.get(announcedNum).text(); String countryName = countryNames.get(announcedNum).text(); String indicatorName = indicatorNames.get(announcedNum).text(); String rank = ranks.get(announcedNum).text(); String lastTimeResult = lastTimeResults.get(announcedNum).text(); String expectation = expectations.get(announcedNum).text(); String thisTimeResult = thisTimeResults.get(announcedNum).text(); //行オブジェクトの作成(行番号は0スタート) HSSFRow row = sheetOutput.createRow(announcedNum); //セルオブジェクトの作成(セル番号は0スタート) //引数はshort型でキャストしなければならない点に注意 HSSFCell announcedDateCell = row.createCell(1); HSSFCell announcedTimeCell = row.createCell(2); HSSFCell countryNameCell = row.createCell(3); HSSFCell indicatorNameCell = row.createCell(4); HSSFCell rankCell = row.createCell(5); HSSFCell lastTimeResultCell = row.createCell(6); HSSFCell expectationCell = row.createCell(7); HSSFCell thisTimeResultCell = row.createCell(8); 
announcedDateCell.setCellValue(announcedDate); announcedTimeCell.setCellValue(announcedTime); countryNameCell.setCellValue(countryName); indicatorNameCell.setCellValue(indicatorName); rankCell.setCellValue(rank); lastTimeResultCell.setCellValue(lastTimeResult); expectationCell.setCellValue(expectation); thisTimeResultCell.setCellValue(thisTimeResult); } } //作成したワークブックを保存する wbOutput.write(fileOut); fileOut.close(); fileIn.close(); } catch (MalformedURLException e) { // TODO Auto-generated catch bklock e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } /** * * @param url URLフォーマットチェック対象文字列 * @return true:URLの形式にマッチしている場合。false:URLの形式にマッチしていない場合。 */ public static boolean checkURLFormat(String url) { return (url.startsWith("http://") || url.startsWith("https://")); } }