Pythonで日経平均のWebスクレイピングをやってみた

Pythonにはスクレイピングのためのライブラリが幾つか用意されています。
・urllib（標準ライブラリ）
・requests（https://pypi.org/project/requests/2.7.0/）
・BeautifulSoup（https://pypi.org/project/beautifulsoup4/）
など

ここでは、標準ライブラリのurllibでページを取得し、BeautifulSoupでHTMLを解析して日経平均株価を取得したいと思います。
日本経済新聞のサイトからデータを取得します。
（2019年11月4日時点ではスクレイピング禁止にはなっていません。）
https://www.nikkei.com/markets/worldidx/chart/nk225/

クローリングによって岡崎市立中央図書館事件のようなことが起こりませんように。。。
このサンプルスクリプトでは、1回の実行につき2つのページへそれぞれ1回ずつアクセスするだけなので、大丈夫だとは思います。
https://ja.wikipedia.org/wiki/%E5%B2%A1%E5%B4%8E%E5%B8%82%E7%AB%8B%E4%B8%AD%E5%A4%AE%E5%9B%B3%E6%9B%B8%E9%A4%A8%E4%BA%8B%E4%BB%B6

その日の始値、終値、高値、安値などを取得してcsvファイルに保存します。
以下のように保存されます。

［画像: nikkei225_csv（f:id:shinryok:20191105000221p:plain）］

実行したのは以下のコードです。

import urllib.request as urllib2
from bs4 import BeautifulSoup
from datetime import datetime
import csv
import time
import re
import os

def getNikkeiHeikin():
    """Scrape the day's Nikkei 225 quotes from the nikkei.com chart page.

    The page is fetched once and its tags are scanned in document order.
    A classed tag whose text starts with a row label (e.g. "始値") arms a
    lookup for that row; a following tag carrying the value (or time) CSS
    class then supplies the data for it.

    Returns:
        dict: raw tag text keyed by "date", "openingQuotation", "highPrice",
        "highTime", "lowPrice", "lowTime", "yearHighPrice", "yearLowPrice"
        and "closingQuotation". A key is absent when no matching tag was
        found on the page.

    Every value found is also printed to stdout. Errors raised by
    ``urlopen`` propagate to the caller.
    """
    url = "https://www.nikkei.com/markets/worldidx/chart/nk225/"
    # BeautifulSoup reads the whole response while constructing the tree, so
    # the connection can be closed as soon as parsing is done.
    with urllib2.urlopen(url) as html:
        soup = BeautifulSoup(html, "html.parser")
    allTags = soup.find_all({'th': True, 'td': True, 'tr': True, 'p': True, 'span': True})

    # Raw string: the original non-raw "\)" is an invalid escape sequence.
    # The timestamp is only accepted when it ends with "大引)".
    datePattern = re.compile(r"[0-9]+.大引\)$")
    valueClass = 'm-trend_economic_table_value'
    timeClass = 'm-trend_economic_table_time'
    # (row label, result key for the value, result key for the time or None).
    # The label doubles as the prefix of the progress message.
    rows = (
        ("始値", "openingQuotation", None),
        ("高値", "highPrice", "highTime"),
        ("安値", "lowPrice", "lowTime"),
        ("年初来高値", "yearHighPrice", None),
        ("年初来安値", "yearLowPrice", None),
    )

    nikkei225 = {}
    pending = None  # label of the row whose value is still awaited
    for tag in allTags:
        classes = tag.get("class")
        if not classes:
            # Only tags that carry a CSS class are considered at all.
            continue
        firstClass = classes[0]
        text = tag.text

        if firstClass == 'economic_value_time' and datePattern.search(text):
            nikkei225["date"] = text
            print("Date = " + nikkei225["date"])

        for label, valueKey, timeKey in rows:
            if text.startswith(label):
                # A label tag only arms the lookup; it holds no data itself.
                pending = label
                break
            if pending == label:
                if firstClass == valueClass:
                    nikkei225[valueKey] = text
                    print(label + " = " + nikkei225[valueKey])
                    pending = None
                elif timeKey is not None and firstClass == timeClass:
                    # The time tag does not end the lookup; the value tag does.
                    nikkei225[timeKey] = text
                    print(label + "_時間 = " + nikkei225[timeKey])
        else:
            # Reached only when the tag was not a row label.
            if firstClass == 'economic_value_now':
                nikkei225["closingQuotation"] = text
                print("終値 = " + nikkei225["closingQuotation"])
    return nikkei225


def getNikkeiIndex():
    """Scrape per-market trading statistics from the nikkei.com index page.

    The page is fetched once and its tags are scanned in document order.
    A tag whose text is exactly one of the known row labels starts a new
    section; the text of the next three tags is stored as that section's
    values.

    Returns:
        dict: lists of raw tag text keyed by "tradingVolume",
        "tradingValue", "unitPrice", "listedIssueNum", "risingStockNum" and
        "decliningStockNum". A key is absent when its label was not found,
        and a list is shorter than three when the page ran out of tags.

    Each completed section is also printed to stdout. Errors raised by
    ``urlopen`` propagate to the caller.
    """
    url = "https://www.nikkei.com/markets/kabu/japanidx/"
    # BeautifulSoup reads the whole response while constructing the tree, so
    # the connection can be closed as soon as parsing is done.
    with urllib2.urlopen(url) as html:
        soup = BeautifulSoup(html, "html.parser")
    allTags = soup.find_all({'th': True, 'td': True, 'tr': True, 'p': True, 'span': True})

    # Row label on the page -> key in the returned dict.
    sections = {
        "売買高": "tradingVolume",
        "売買代金": "tradingValue",
        "売買単価": "unitPrice",
        "上場銘柄数": "listedIssueNum",
        "値上がり銘柄数": "risingStockNum",
        "値下がり銘柄数": "decliningStockNum",
    }
    labelPattern = re.compile("^(" + "|".join(sections) + ")$")
    # NOTE(review): the original author's note says the three values are
    # TSE 1st section, TSE 2nd section and JASDAQ, in that order -- confirm
    # against the current page layout.
    valuesPerRow = 3

    nikkei225_index = {}
    label = None   # label of the section being collected, if any
    values = None  # list receiving that section's values
    for tag in allTags:
        text = tag.text
        matched = labelPattern.search(text)
        if matched:
            # A label always starts a fresh section. Counting per list (not
            # with one shared counter) keeps a short row from truncating the
            # section that follows it.
            label = matched.group(1)
            values = []
            nikkei225_index[sections[label]] = values
            continue
        if label is None:
            continue
        values.append(text)
        if len(values) == valuesPerRow:
            print(label + " = " + str(values))
            label = None
    return nikkei225_index


def writeCSV_nikkei(nikkei225, nikkei225_index):
    """Append one row of scraped data to ``nikkei225_csv/nikkei225.csv``.

    The file lives under the current working directory. The directory is
    created when missing, and the header row is written only when the file
    does not exist yet.

    Args:
        nikkei225: dict with the keys "date", "openingQuotation",
            "closingQuotation", "highPrice", "highTime", "lowPrice" and
            "lowTime".
        nikkei225_index: dict mapping "tradingVolume", "tradingValue",
            "unitPrice", "listedIssueNum", "risingStockNum" and
            "decliningStockNum" to sequences of at least three values.
            Element 0 fills the plain column and element 2 the "_J" column.

    When a required key or element is missing, an error line is printed and
    nothing is written, so the CSV never receives a partial row or a header
    without data. I/O errors propagate to the caller.
    """
    print(nikkei225)
    print(nikkei225_index)

    # (CSV column, key in nikkei225)
    quoteColumns = (
        ('Date', 'date'),
        ('OpeningQuotation', 'openingQuotation'),
        ('ClosingQuotation', 'closingQuotation'),
        ('HighPrice', 'highPrice'),
        ('HighTime', 'highTime'),
        ('LowPrice', 'lowPrice'),
        ('LowTime', 'lowTime'),
    )
    # (CSV column, key in nikkei225_index); each also yields a "<column>_J".
    indexColumns = (
        ('TradingVolume', 'tradingVolume'),
        ('TradingValue', 'tradingValue'),
        ('UnitPrice', 'unitPrice'),
        ('ListedIssueNum', 'listedIssueNum'),
        ('RisingStockNum', 'risingStockNum'),
        ('DecliningStockNum', 'decliningStockNum'),
    )
    fieldNames = (
        [column for column, _ in quoteColumns]
        + [column for column, _ in indexColumns]
        + [column + '_J' for column, _ in indexColumns]
    )

    # Build the row before touching the file so incomplete scrape results
    # cannot leave a half-written file behind.
    try:
        row = {column: nikkei225[key] for column, key in quoteColumns}
        for column, key in indexColumns:
            values = nikkei225_index[key]
            row[column] = values[0]
            # NOTE(review): index 2 is presumably the JASDAQ figure, going by
            # the "_J" suffix -- confirm against the scraped page layout.
            row[column + '_J'] = values[2]
    except (KeyError, IndexError) as err:
        print("Error: " + repr(err))
        return

    outDir = os.path.join(os.getcwd(), "nikkei225_csv")
    os.makedirs(outDir, exist_ok=True)
    fout = os.path.join(outDir, "nikkei225.csv")
    isNewFile = not os.path.isfile(fout)
    # Mode 'a' creates the file when it is missing, so one code path serves
    # both the first write and every later append.
    with open(fout, 'a') as csvFile:
        writer = csv.DictWriter(csvFile, fieldnames=fieldNames, lineterminator='\n')
        if isNewFile:
            writer.writeheader()
        writer.writerow(row)

if __name__ == '__main__':
    # Scrape both pages (quotes first, then market statistics) and store
    # the combined result as one CSV row.
    dailyQuotes = getNikkeiHeikin()
    marketStats = getNikkeiIndex()
    writeCSV_nikkei(dailyQuotes, marketStats)

見習いプログラマの学習日記

Pythonプログラムの学習記録

Pythonで日経平均のWebスクレイピングをやってみた