"""Scrape single-season batter-vs-pitcher statistics from Baseball-Reference.

For every batter id listed in BATTERS_FILE the script fetches the batter's
"batter vs. pitcher" index page, collects the id of every pitcher linked
from it and, for each of those ids that also appears in PITCHERS_FILE,
fetches the head-to-head page and keeps the stat cells of the rows whose
year equals ``year``.  One comma-separated text file per batter, named after
the batter, is written to OUTPUT_DIR.

Run as a script; requests are spaced out with random sleeps.
"""

import random
import time
from contextlib import closing

try:
    from urllib2 import urlopen
except ImportError:  # urllib2 is not available; use the urllib.request name
    from urllib.request import urlopen

from bs4 import BeautifulSoup

year = 2013

BATTERS_FILE = 'C:\\Users\\Eric\\Dropbox\\IU\\Complex Systems Project- Filippo Radicchi\\Yearly Usernames\\2013usernames_nonpitchers5.txt'
PITCHERS_FILE = 'C:\\Users\\Eric\\Dropbox\\IU\\Complex Systems Project- Filippo Radicchi\\Yearly Usernames\\2013usernames_pitchers.txt'
OUTPUT_DIR = 'C:\\Users\\Eric\\Dropbox\\IU\\Complex Systems Project- Filippo Radicchi\\2013 Batters\\'

# Columns 2..20 of a head-to-head row hold the 19 statistics that are kept.
FIRST_STAT_COLUMN = 2
LAST_STAT_COLUMN = 20


def _read_ids(path):
    """Return every line of *path*, whitespace-stripped, closing the file."""
    with open(path, 'r') as handle:
        return [line.strip() for line in handle]


def _fetch_soup(url):
    """Download *url* and return it parsed by BeautifulSoup's lxml parser."""
    with closing(urlopen(url)) as response:
        html = response.read()
    return BeautifulSoup(html, 'lxml')


def _data_rows(soup):
    """Return the <tr> rows of the first <tbody> that are not 'thead' rows."""
    body = soup.find('tbody')
    # Tag.get('class') is None when a row carries no class attribute at all;
    # fall back to an empty list so such rows count as data rows instead of
    # raising TypeError on the membership test.
    return [row for row in body.find_all('tr')
            if 'thead' not in (row.get('class') or [])]


def _batter_name(soup):
    """Return the part of the page title that precedes ' vs.'."""
    title = soup.title.string
    return title[0:title.find(' vs.')]


def _pitcher_name(soup):
    """Return the part of the page title between 'vs. ' and the site suffix."""
    title = soup.title.string
    begin = title.find('vs. ') + 4
    end = title.find(' - Baseball-Reference.com')
    return title[begin:end]


def _faced_pitcher_ids(soup):
    """Return the pitcher id taken from the first-cell link of each data row."""
    marker = 'pitcher='
    ids = []
    for row in _data_rows(soup):
        link = row.find('td').a.get('href')
        # The id is everything after 'pitcher=' in the link target.
        ids.append(link[link.find(marker) + len(marker):])
    return ids


def _season_stats(soup, season, batter, pitcher):
    """Return the stat cell strings of every data row whose year is *season*.

    The row's year is read from the text of its second cell, or from that
    cell's ``csk`` attribute when the cell has no single string.  Rows with
    neither are reported on stdout and skipped.  *batter* and *pitcher* are
    only used in that report.
    """
    wanted = str(season)
    stats = []
    for row in _data_rows(soup):
        cells = row.find_all('td')
        year_cell = cells[1]
        if year_cell.string is not None:
            row_year = year_cell.string.strip()
        elif year_cell.get('csk') is not None:
            row_year = year_cell.get('csk')
        else:
            row_year = ''
            print('weird line in ' + batter + ' ' + pitcher)
            print(year_cell)
        if row_year == wanted:
            stats.extend(cells[column].string
                         for column in range(FIRST_STAT_COLUMN,
                                             LAST_STAT_COLUMN + 1))
    return stats


def _scrape_batter(batter, possible_pitchers, season):
    """Scrape one batter and return ``(batter_name, csv_lines)``.

    Each line is ``pitcher_name,stat,...`` for a pitcher in
    *possible_pitchers* against whom the batter has rows for *season*.
    Progress is printed for every pitcher linked from the batter's page.
    """
    url = 'http://www.baseball-reference.com/play-index/batter_vs_pitcher.cgi?year=0&n1='+batter+'&id='+batter+'&batter='+batter
    soup = _fetch_soup(url)

    name = _batter_name(soup)
    print(name)

    pitchers = _faced_pitcher_ids(soup)
    total = len(pitchers)
    lines = []
    for position, pitcher in enumerate(pitchers, 1):
        print(str(position) + ' of ' + str(total))
        if pitcher not in possible_pitchers:
            continue
        time.sleep(random.random() * 2)
        url = 'http://www.baseball-reference.com/play-index/batter_vs_pitcher.cgi?batter='+batter+'&pitcher='+pitcher
        matchup = _fetch_soup(url)
        p_name = _pitcher_name(matchup)
        stats = _season_stats(matchup, season, batter, pitcher)
        if stats:
            lines.append(','.join([p_name] + [str(stat) for stat in stats]))
    return name, lines


def main():
    """Scrape every batter in BATTERS_FILE and write one file per batter."""
    batters = _read_ids(BATTERS_FILE)
    # A set keeps the per-pitcher membership test constant-time.
    possible_pitchers = set(_read_ids(PITCHERS_FILE))

    for batter in batters:
        print(batter)
        time.sleep(random.random() * 10)
        name, lines = _scrape_batter(batter, possible_pitchers, year)
        with open(OUTPUT_DIR + name + '.txt', 'w') as writer:
            writer.write('\n'.join(lines).strip())


if __name__ == '__main__':
    main()