## Walter the Wikipedia Web Scraper
def walter(year):
    """Scrape company info for every IPO listed on Wikipedia for *year*.

    Opens the 'Category:<year>_initial_public_offerings' page, clicks
    through to each listed company, and pulls the 'Industry', 'Founded',
    'Headquarters' and 'Traded as' rows from the company's infobox.

    Parameters
    ----------
    year : int or str
        The IPO year whose Wikipedia category page should be scraped.

    Returns
    -------
    list of dict
        One dict per company, built by zipping the module-level ``cols``
        sequence against [name, year, ticker, industry, founded,
        headquarters]; missing fields are np.NaN.

    Side effects: appends unreachable/unparseable company names to the
    module-level ``passed`` list and prints a completion summary.
    """
    ## Category page listing all the IPOs for that year.
    html = 'https://en.wikipedia.org/wiki/Category:' + str(year) + '_initial_public_offerings'
    html_open = urlopen(html)

    ## Start a browser session on the same listing page so each company
    ## link can be clicked.
    driver = webdriver.Chrome(ChromeDriverManager().install())
    driver.get(html)

    ## Parse the listing page and collect every company name shown.
    soup_year = bs(html_open)
    container = soup_year.find("div", {"class": "mw-category"})
    companies = [li.get_text() for li in container.find_all("li")]

    lst_dict = []       ## one record dict per company
    exceptions = 0      ## how many companies could not be scraped
    global passed       ## module-level log of skipped company names

    for company in companies:
        ## Default every field to NaN so a company with a sparse infobox
        ## still produces a complete record.
        industry = np.NaN
        founded = np.NaN
        headquarters = np.NaN
        ticker = np.NaN
        ## BUG FIX: initialize vals before any navigation. Previously it
        ## was assigned inside the try, so a failed click left it either
        ## unbound (NameError on the first company) or holding the
        ## previous company's data.
        vals = [company, year]

        try:
            ## find_element_by_link_text is the legacy Selenium API; kept
            ## for compatibility with the driver version this file uses.
            driver.find_element_by_link_text(company).click()
        except Exception:
            ## Could not open the company page: log it and stay on the
            ## listing page. (BUG FIX: the original still navigated
            ## "back" after a failed click, leaving the listing page and
            ## breaking every subsequent click.)
            passed.append(company)
            exceptions += 1
            continue

        ## Be polite to Wikipedia: throttle the crawler.
        sleep(5)

        try:
            soup_company = bs(driver.page_source, 'html.parser')
            ## The company's info table.
            table = soup_company.find("table", {"class": "infobox vcard"})
            for row in table.find_all("tr"):
                try:
                    ## Rows without a <th> header are layout rows; skip.
                    header = row.find("th").get_text()
                except Exception:
                    continue
                if header == 'Industry':
                    industry = row.find("td").get_text()
                elif header == 'Founded':
                    founded = row.find("td").get_text()
                elif header == 'Headquarters':
                    headquarters = row.find("td").get_text()
                elif header == 'Traded as':
                    ticker = row.find("td").get_text()
        except Exception:
            ## Page had no parseable infobox; record the miss but still
            ## emit a row of NaNs for this company (matches original
            ## behavior of appending on inner failure).
            passed.append(company)
            exceptions += 1

        ## Assemble the record in ``cols`` order and store it.
        vals.extend([ticker, industry, founded, headquarters])
        lst_dict.append(dict(zip(cols, vals)))

        ## Return to the year's listing page before the next company.
        driver.execute_script("window.history.go(-1)")
        sleep(5)

    ## Report how many companies were skipped for this year.
    print("Walter complete for year " + str(year) + ". Number of companies passed was: " + str(exceptions))
    driver.quit()
    return lst_dict