特定のWebサイトにGETリクエストを送り、必要なデータを取得するプログラム

だいぶ前に作成して使っていたが、少し変更したのでメモ。
前提として、Apache POI、jsoup、log4j を使用する。

import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.MalformedURLException;

import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

/**
 * Command-line tool that reads a list of URLs from an Excel (.xls) workbook,
 * downloads each page with jsoup, extracts the {@code <td>} cells of the page's
 * tables as eight columns, and writes the result into one sheet per scraped URL
 * of a new .xls workbook.
 *
 * <p>A URL that is missing, malformed, or fails to download is logged and
 * skipped; the remaining URLs are still processed and the workbook is still
 * written.
 */
public class EconomicIndicatorWebScraping {
	/** Zero-based index of the first input row to read (1 = the second row of the sheet). */
	public static int CRAWLLING_START_URL_ROWNUM = 1;
	/** Zero-based index of the last input row to read (inclusive). */
	public static int CRAWLLING_END_URL_ROWNUM = 136;
	/** Path of the .xls workbook that lists the URLs to crawl. */
	public static String INPUT_FILE_NAME = "C:\\opt\\work\\InPut\\crawllingEcoIndURL.xls";
	/** Path of the .xls workbook that receives the scraped data. */
	public static String OUTPUT_FILE_NAME = "C:\\opt\\work\\OutPut\\scrapingEcoIndResult.xls";
	static Logger logger = Logger.getLogger(EconomicIndicatorWebScraping.class.getName());

	/** Name of the input sheet that holds the URL list. */
	private static final String INPUT_SHEET_NAME = "crawllingURL";
	/** Zero-based column of the input sheet that holds the URL. */
	private static final int URL_COLUMN_INDEX = 3;
	/** Number of {@code <td>} columns extracted from each table row. */
	private static final int TABLE_COLUMN_COUNT = 8;
	/** Zero-based output column of the first extracted value (column 0 is left empty). */
	private static final int FIRST_OUTPUT_COLUMN_INDEX = 1;

	/**
	 * Entry point: reads the URL list, scrapes every valid URL and saves the results.
	 *
	 * @param args command-line arguments (not used)
	 */
	public static void main(String[] args) {
		// Configure logging once, up front, so that every message below is handled
		// the same way (not only those emitted after the first invalid URL).
		PropertyConfigurator.configure( "log4j.properties" );

		// Proxy settings. The host and port values are placeholders and must be
		// replaced with real values before running behind a proxy.
		System.setProperty("proxySet", "true");
		System.setProperty("proxyHost", "プロキシサーバのIP");
		System.setProperty("proxyPort", "ポート番号");

		try {
			// Load the workbook that lists the URLs to crawl.
			HSSFWorkbook wbInput = readInputWorkbook();
			HSSFSheet sheetInput = wbInput.getSheet(INPUT_SHEET_NAME);
			if (sheetInput == null) {
				logger.error("Sheet \"" + INPUT_SHEET_NAME + "\" was not found in " + INPUT_FILE_NAME);
				return;
			}

			// Workbook that collects the scraped results.
			HSSFWorkbook wbOutput = new HSSFWorkbook();

			// Never read past the last row that actually exists in the sheet.
			int lastRowNum = Math.min(CRAWLLING_END_URL_ROWNUM, sheetInput.getLastRowNum());
			for (int crawllingNum = CRAWLLING_START_URL_ROWNUM; crawllingNum <= lastRowNum; crawllingNum++) {
				String url = readUrl(sheetInput.getRow(crawllingNum));

				if (!EconomicIndicatorWebScraping.checkURLFormat(url)) {
					String erroMsg = "URLデータがURL形式に沿っていません。" + "データ値:" + url + "行番号:" + crawllingNum;
					logger.error(erroMsg);
					continue;
				}

				System.out.println(url);

				try {
					scrapeIntoNewSheet(url, wbOutput);
				} catch (IOException e) {
					// One unreachable page must not discard the results gathered so far.
					logger.error("Failed to scrape URL: " + url + " (input row index " + crawllingNum + ")", e);
				}
			}

			// Save the workbook. The output file is opened only now, so a failure
			// earlier in the run does not leave a truncated file behind.
			writeOutputWorkbook(wbOutput);
		} catch (IOException e) {
			logger.error("Failed to read " + INPUT_FILE_NAME + " or to write " + OUTPUT_FILE_NAME, e);
		}
	}

	/**
	 * Loads the input workbook and closes the underlying file stream afterwards.
	 *
	 * @return the workbook read from {@link #INPUT_FILE_NAME}
	 * @throws IOException if the file cannot be opened or read
	 */
	private static HSSFWorkbook readInputWorkbook() throws IOException {
		try (InputStream fileIn = new FileInputStream(INPUT_FILE_NAME)) {
			return new HSSFWorkbook(fileIn);
		}
	}

	/**
	 * Writes the given workbook to {@link #OUTPUT_FILE_NAME} and closes the file stream.
	 *
	 * @param wbOutput workbook to save
	 * @throws IOException if the file cannot be opened or written
	 */
	private static void writeOutputWorkbook(HSSFWorkbook wbOutput) throws IOException {
		try (OutputStream fileOut = new FileOutputStream(OUTPUT_FILE_NAME)) {
			wbOutput.write(fileOut);
		}
	}

	/**
	 * Reads the URL text from the URL column of an input row.
	 *
	 * @param rowInput input row; may be {@code null} when the sheet has no such row
	 * @return the cell text, or {@code null} when the row or cell is missing or the
	 *         cell does not hold text
	 */
	private static String readUrl(Row rowInput) {
		if (rowInput == null) {
			return null;
		}
		Cell urlCell = rowInput.getCell(URL_COLUMN_INDEX);
		if (urlCell == null) {
			return null;
		}
		try {
			return urlCell.getStringCellValue();
		} catch (IllegalStateException e) {
			// POI rejects getStringCellValue() on non-text (e.g. numeric) cells;
			// such a cell cannot contain a usable URL.
			return null;
		}
	}

	/**
	 * Downloads one page and copies its table cells into a newly created output sheet.
	 *
	 * <p>Cells are selected per column position with {@code td:nth-child(8n+k)} for
	 * k = 1..8. The original code names these columns, in order: announced date,
	 * announced time, country name, indicator name, rank, last result, expectation
	 * and this-time result. The k-th value of output row r is the r-th match of the
	 * k-th selector.
	 *
	 * @param url      page to download
	 * @param wbOutput workbook that receives the new sheet
	 * @throws IOException if the page cannot be downloaded
	 */
	private static void scrapeIntoNewSheet(String url, HSSFWorkbook wbOutput) throws IOException {
		Document doc = Jsoup.connect(url).get();
		Elements table = doc.select("table");

		Elements[] columns = new Elements[TABLE_COLUMN_COUNT];
		int minSize = Integer.MAX_VALUE;
		int maxSize = 0;
		for (int col = 0; col < TABLE_COLUMN_COUNT; col++) {
			columns[col] = table.select("td:nth-child(" + TABLE_COLUMN_COUNT + "n+" + (col + 1) + ")");
			minSize = Math.min(minSize, columns[col].size());
			maxSize = Math.max(maxSize, columns[col].size());
		}
		if (minSize != maxSize) {
			// The selectors matched a different number of cells per column, so only the
			// rows present in every column are written instead of failing on an index.
			// NOTE(review): values may be misaligned across columns in this case.
			logger.warn("Table columns have different sizes (" + minSize + " to " + maxSize
					+ "); writing " + minSize + " rows for URL: " + url);
		}

		// The sheet is created only after a successful download, so a failed URL
		// does not leave an empty sheet in the output workbook.
		HSSFSheet sheetOutput = wbOutput.createSheet();
		for (int rowNum = 0; rowNum < minSize; rowNum++) {
			// Row and cell indexes are zero-based.
			HSSFRow row = sheetOutput.createRow(rowNum);
			for (int col = 0; col < TABLE_COLUMN_COUNT; col++) {
				HSSFCell cell = row.createCell(FIRST_OUTPUT_COLUMN_INDEX + col);
				cell.setCellValue(columns[col].get(rowNum).text());
			}
		}
	}

	/**
	 * Checks whether a string looks like an HTTP or HTTPS URL.
	 *
	 * @param url string to check; may be {@code null}
	 * @return {@code true} if {@code url} starts with {@code http://} or {@code https://};
	 *         {@code false} otherwise, including when {@code url} is {@code null}
	 */
	public static boolean checkURLFormat(String url) {
		return url != null && (url.startsWith("http://") || url.startsWith("https://"));
	}
}