Pythonでプロ野球の個人成績一覧をJSONにして取得する
Page content
スコア速報と同じく、Pythonでスクレイピングして各チーム最新の個人成績を取得したい。
やること
プロ野球の個人成績を見られるサイトのHTMLから要素を引っこ抜き、個人成績ページから適当に情報をつまんでJSONにまとめる。
書いてみる
import requests
import json
from bs4 import BeautifulSoup
# Indexes into the list of <h1> elements of a player page:
# the last <h1> is read as the player name, the second-to-last as the team.
NAME_HI = -1
TEAM_H1 = -2
# Number of leading <th> cells skipped when reading the main records table.
EXCEPT_TITLE = 1
# Number of leading <tr> rows skipped in the split tables before the data rows.
EXCEPT_TITLE_HEADER = 2
# Number of leading <th>/<td> cells skipped so only the value columns remain.
EXCEPT_HEAD_CONTENT = 1
# Number of leading characters of the first <th> used as a key prefix
# in the scoring-position table.
CHANCE_STR_DIVIDER = 3
# Number of leading columns dropped from the right/left split tables
# (1 for pitchers, 2 for hitters).
PITCHER_DUMP_VAL = 1
HITTER_DUMP_VAL = 2
# NOTE(review): the article elides code here; BASEURL and TEAM_NUM_LIST, which
# are used further down, are presumably defined in the omitted part -- confirm.
...
def request_soup(url, timeout=10):
    """Download *url* and return its body parsed as a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A BeautifulSoup object built with the 'html.parser' backend.

    Raises:
        requests.HTTPError: the server answered with an error status.
        requests.Timeout: the server did not answer within *timeout* seconds.
    """
    # requests waits indefinitely unless a timeout is given, so one stalled
    # page would otherwise hang the whole scrape.
    res = requests.get(url, timeout=timeout)
    res.raise_for_status()
    return BeautifulSoup(res.content, 'html.parser')
def link_tail_list(url):
    """Return the href of every player link in the first table of *url*.

    Only <td> cells whose class attribute is 'lt yjM' are inspected; the href
    of the first <a> inside each such cell is collected, in document order.
    """
    member_table = request_soup(url).find('table')
    hrefs = []
    for cell in member_table.find_all('td', class_='lt yjM'):
        hrefs.append(cell.find('a').get('href'))
    return hrefs
def full_val(str_val):
    """Return '0' when *str_val* is the placeholder '-', else *str_val* as is."""
    return '0' if str_val == '-' else str_val
def basic_information(personal_soup):
    """Extract the player's name and team from a personal page.

    The name is the text of the last <h1> up to the first '(';
    the team is the text of the second-to-last <h1>.

    Returns:
        dict with the keys 'Name' and 'Team'.
    """
    headings = personal_soup.find_all('h1')
    player_name = headings[NAME_HI].text.split('(')[0]
    team_name = headings[TEAM_H1].text
    return {'Name': player_name, 'Team': team_name}
def confirm_pitcher_tables(tables):
    """Pick the pitcher tables out of *tables* by the text of their first row.

    Returns:
        (records_table, rl_table): the table titled '投手成績' and the one
        titled '左右打者別成績'. A slot is None when no table has that title;
        if several tables share a title the last one is kept.
    """
    wanted = ('投手成績', '左右打者別成績')
    found = dict.fromkeys(wanted)
    for candidate in tables:
        # The title is the text of the first <tr>, with newlines removed.
        caption = candidate.find('tr').text.replace('\n', '')
        if caption in found:
            found[caption] = candidate
    return tuple(found[title] for title in wanted)
def confirm_hitter_tables(tables):
    """Pick the hitter tables out of *tables* by the text of their first row.

    Returns:
        (records_table, chance_table, rl_table, count_table, runner_table),
        matched against the titles '打者成績', '得点圏成績', '左右投手別成績',
        'カウント別成績' and '塁状況別成績' respectively. A slot is None when
        no table has that title; if several tables share a title the last one
        is kept.
    """
    wanted = (
        '打者成績',
        '得点圏成績',
        '左右投手別成績',
        'カウント別成績',
        '塁状況別成績',
    )
    found = dict.fromkeys(wanted)
    for candidate in tables:
        # The title is the text of the first <tr>, with newlines removed.
        caption = candidate.find('tr').text.replace('\n', '')
        if caption in found:
            found[caption] = candidate
    return tuple(found[title] for title in wanted)
def dict_records(records_table):
    """Map each column heading of *records_table* to its cell value.

    The first EXCEPT_TITLE <th> cells are skipped; the remaining headings are
    paired positionally with every <td> in the table, with '-' turned into
    '0' by full_val.
    """
    headings = records_table.find_all('th')[EXCEPT_TITLE:]
    cells = records_table.find_all('td')
    return {
        heading.text: full_val(cell.text)
        for heading, cell in zip(headings, cells)
    }
def chance_records(chance_table):
    """Return the scoring-position stats with prefixed column names.

    Each key is the first CHANCE_STR_DIVIDER characters of the first <th>
    followed by a column heading; the first EXCEPT_HEAD_CONTENT headings are
    not used as columns. Values are the <td> texts passed through full_val.
    """
    th_texts = [th.text for th in chance_table.find_all('th')]
    # Only index the first heading when there is one, so a table without any
    # <th> yields an empty dict.
    prefix = th_texts[0][:CHANCE_STR_DIVIDER] if th_texts else ''
    keys = [prefix + column for column in th_texts[EXCEPT_HEAD_CONTENT:]]
    values = [full_val(td.text) for td in chance_table.find_all('td')]
    return dict(zip(keys, values))
def records_by_rl(rl_table, dump_val):
    """Return the right/left split stats found in *rl_table*.

    Args:
        rl_table: table holding one row per split.
        dump_val: number of leading columns to drop from the headings and
            from every row.
            pitcher: 1 ('打者')
            hitter: 2 ('投手', '打席')

    Returns:
        dict with up to two keys, '対右' and '対左', each mapping column
        heading to value. Rows whose first cell contains neither '右' nor
        '左' are ignored.
    """
    columns = [th.text for th in rl_table.find_all('th')][dump_val:]
    split_stats = {}
    for row in rl_table.find_all('tr')[EXCEPT_TITLE_HEADER:]:
        label = row.find('td').text
        values = [full_val(td.text) for td in row.find_all('td')[dump_val:]]
        # '右' is tested first, so a label containing both counts as right.
        if '右' in label:
            key = '対右'
        elif '左' in label:
            key = '対左'
        else:
            continue
        split_stats[key] = dict(zip(columns, values))
    return split_stats
def records_by_count_or_runner(table_by):
    """Return the per-situation stats of a count or base-runner table.

    The text of the first cell of each data row becomes the key; the value is
    a dict from column heading to cell value, with '-' turned into '0'.
    The first EXCEPT_TITLE_HEADER rows are skipped, as are the first
    EXCEPT_HEAD_CONTENT headings and cells.
    """
    columns = [th.text for th in table_by.find_all('th')][EXCEPT_HEAD_CONTENT:]
    by_situation = {}
    for row in table_by.find_all('tr')[EXCEPT_TITLE_HEADER:]:
        label = row.find('td').text
        value_cells = row.find_all('td')[EXCEPT_HEAD_CONTENT:]
        by_situation[label] = dict(
            zip(columns, (full_val(cell.text) for cell in value_cells))
        )
    return by_situation
def append_team_pitcher_array(link_tail_list):
    """Scrape every pitcher page of one team.

    Args:
        link_tail_list: URL tails of the personal pages; each one is appended
            to BASEURL.

    Returns:
        list of dicts, one per pitcher, holding 'Name', 'Team', the main
        stats and, when the page has them, the '対右' / '対左' splits.

    NOTE(review): the sample output in the article nests the stats under a
    'Records' key, whereas this function merges them into the top-level
    player dict -- confirm which shape consumers expect.
    """
    team_pitcher_list = []
    for ptail in link_tail_list:
        personal_soup = request_soup(BASEURL + ptail)
        personal_dict = basic_information(personal_soup)
        records_table, rl_table = confirm_pitcher_tables(
            personal_soup.find_all('table'))
        # A page without a '投手成績' table yields None here; treat it like
        # the optional split table instead of crashing in dict_records.
        if records_table is not None:
            records = dict_records(records_table)
        else:
            records = {}
        if rl_table is not None:
            records.update(records_by_rl(rl_table, PITCHER_DUMP_VAL))
        personal_dict.update(records)
        team_pitcher_list.append(personal_dict)
    return team_pitcher_list
def append_team_hitter_array(link_tail_list):
    """Scrape every hitter page of one team.

    Args:
        link_tail_list: URL tails of the personal pages; each one is appended
            to BASEURL.

    Returns:
        list of dicts, one per hitter, holding 'Name', 'Team', the main stats
        and, when the page has the corresponding table, the scoring-position
        stats, the '対右' / '対左' splits, and the nested 'カウント' and '走者'
        breakdowns.

    NOTE(review): the sample output in the article nests the stats under a
    'Records' key, whereas this function merges them into the top-level
    player dict -- confirm which shape consumers expect.
    """
    team_hitter_list = []
    for htail in link_tail_list:
        personal_soup = request_soup(BASEURL + htail)
        personal_dict = basic_information(personal_soup)
        (records_table, chance_table, rl_table,
         count_table, runner_table) = confirm_hitter_tables(
            personal_soup.find_all('table'))
        # A page without a '打者成績' table yields None here; treat it like
        # the other optional tables instead of crashing in dict_records.
        if records_table is not None:
            records = dict_records(records_table)
        else:
            records = {}
        if chance_table is not None:
            records.update(chance_records(chance_table))
        if rl_table is not None:
            records.update(records_by_rl(rl_table, HITTER_DUMP_VAL))
        if count_table is not None:
            records['カウント'] = records_by_count_or_runner(count_table)
        if runner_table is not None:
            records['走者'] = records_by_count_or_runner(runner_table)
        personal_dict.update(records)
        team_hitter_list.append(personal_dict)
    return team_hitter_list
def append_records_array():
    """Scrape the pitchers and hitters of every team in TEAM_NUM_LIST.

    For each team both member lists are fetched first (type=p, then type=b),
    after which the pitcher pages and then the hitter pages are scraped.

    Returns:
        (pitcher_list, hitter_list): two flat lists of player dicts covering
        all teams, in TEAM_NUM_LIST order.
    """
    all_pitchers, all_hitters = [], []
    for team_no in TEAM_NUM_LIST:
        team_root = BASEURL + 'npb/teams/' + str(team_no)
        pitcher_tails = link_tail_list(team_root + '/memberlist?type=p')
        hitter_tails = link_tail_list(team_root + '/memberlist?type=b')
        all_pitchers += append_team_pitcher_array(pitcher_tails)
        all_hitters += append_team_hitter_array(hitter_tails)
    return all_pitchers, all_hitters
# Scrape all teams, then dump pitchers and hitters into separate JSON files.
pitcher_list, hitter_list = append_records_array()
# ensure_ascii=False writes the Japanese text as is, so the files are opened
# explicitly as UTF-8 instead of relying on the platform's default encoding.
with open('pitchers.json', 'w', encoding='utf-8') as pf:
    json.dump({'Pitcher': pitcher_list}, pf, indent=2, ensure_ascii=False)
with open('hitters.json', 'w', encoding='utf-8') as hf:
    json.dump({'Hitter': hitter_list}, hf, indent=2, ensure_ascii=False)
結果
以下のようなJSONファイルで保存される。
{
"Pitcher": [
{
"Name": "山口 俊",
"Team": "読売ジャイアンツ",
"Records": {
"防御率": "1.59",
"登板": "5",
"先発": "5",
"完投": "0",
"完封": "0",
"無四球": "0",
"QS": "4",
"交代完了": "0",
"勝利": "4",
"敗戦": "0",
"ホールド": "0",
"HP": "0",
"セーブ": "0",
"勝率": "1.000",
"投球回": "34",
"打者": "137",
"被安打": "19",
"被本塁打": "1",
"奪三振": "30",
"奪三振率": "7.94",
"与四球": "16",
"与死球": "4",
"暴投": "1",
"ボーク": "0",
"失点": "7",
"自責点": "6",
"被打率": ".168",
"K/BB": "1.88",
"WHIP": "1.03",
"対右": {
"被打率": ".082",
"被打数": "49",
"被安打": "4",
"被本塁打": "1",
"奪三振": "15",
"与四球": "10",
"与死球": "4"
},
"対左": {
"被打率": ".234",
"被打数": "64",
"被安打": "15",
"被本塁打": "0",
"奪三振": "15",
"与四球": "6",
"与死球": "0"
}
}
},
...
おわり
MLBはオープンデータがある程度提供されているが、日本だとそういうのは無さそう。
Webサイトのちょっとした仕様変更とかで動かなくなりそうなので、定期的に取るならメンテナンスが大変。
[参考記事]【WHIP, K/BB他】NPB(2019)セイバーメトリクス投手指標の算出①
[参考記事]【wOBA】NPB(2019)セイバーメトリクス野手指標の算出①