Here’s a bit of code from a class project.
In about two months, everyone here will be searching through job boards to find the next steps in our careers. For this project we want to find a way to predict data science salaries in Boston.
We scanned Indeed.com and scraped information related to features such as job title, years of experience, company, and keywords like Python. Only a few listings had salary information or descriptions we could search through. Therefore, we gathered most of our data by manipulating the search query and trusting Indeed’s backend processing to deliver relevant results.
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
# String format: "keyword keyword"
def _fetch_soup(url):
    """Download *url* and return its HTML parsed with BeautifulSoup."""
    # The context manager closes the HTTP response even if parsing fails.
    with urlopen(url) as response:
        return BeautifulSoup(response.read(), 'html.parser')


def count_results(query_string=None):
    """Collect the IDs of Indeed "data scientist" listings in Boston matching any keyword.

    The keywords are OR-ed together inside the search query, the total hit
    count is read from the first results page, and every results page
    (100 listings each) is then scraped for the ``id`` attribute of each
    organic listing's title element.

    Args:
        query_string: Whitespace-separated keywords, e.g. ``"PHD ph.d"``.
            When ``None``, a notice is printed and nothing is fetched.

    Returns:
        ``None`` when no keyword was given. Otherwise a DataFrame with an
        ``id`` column and a second column named after the OR-ed keywords
        (e.g. ``"PHD OR ph.d"``) that holds ``1`` for every listing.
        Duplicate rows are dropped without resetting the index, so the
        index may contain gaps.

    Raises:
        ValueError: If the result count cannot be located or parsed on the
            search page.
    """
    # Imported locally so the block does not depend on a file-level import.
    from urllib.parse import quote

    if query_string is None:
        print("No keyword entered.")
        return None

    # split() without an argument ignores repeated/leading/trailing spaces,
    # which would otherwise produce empty keywords in the OR expression.
    keywords = query_string.split()
    # Percent-encode each keyword so characters such as '+' or '&' cannot
    # corrupt the query string.
    query = "%20OR%20".join(quote(word, safe="") for word in keywords)
    column_label = " OR ".join(keywords)
    base_url = "http://www.indeed.com/jobs?q=data+scientist+%28{}%29&l=Boston".format(query)

    # Find the number of results; the count is taken to be the last
    # whitespace-separated token of the searchCount element's text.
    count_tag = _fetch_soup(base_url).find("div", attrs={"id": "searchCount"})
    if count_tag is None:
        raise ValueError("Could not find the result count (div#searchCount) on the search page.")
    number_of_results = int(count_tag.text.split()[-1].replace(',', ''))

    # Viewing 100 results at a time means fewer page refreshes. Ceiling
    # division avoids requesting an extra, empty page when the count is an
    # exact multiple of the page size.
    page_size = 100
    page_count = -(-number_of_results // page_size)

    rows = []
    for page_number in range(page_count):
        page_url = "{}&limit={}&start={}".format(base_url, page_size, page_size * page_number)
        listings = _fetch_soup(page_url).find_all('div', attrs={'data-tn-component': 'organicJob'})
        # Extract the ID for each job listing, skipping any listing that
        # lacks the expected title element or id attribute.
        for listing in listings:
            title = listing.find('h2', attrs={"class": "jobtitle"})
            if title is not None and title.get('id'):
                rows.append([title['id'], 1])

    # Build the frame in one step: DataFrame.append no longer exists in
    # pandas 2.x, and naming the columns here also works for zero rows.
    job_ids = pd.DataFrame(rows, columns=['id', column_label])
    # Remove re-posted jobs
    return job_ids.drop_duplicates()
#job_ids.to_csv(path_or_buf="id_and_{}.csv".format(query))
count_results()
No keyword entered.
count_results("PHD ph.d")
id | PHD OR ph.d | |
---|---|---|
0 | jl_033222419605ed3f | 1 |
1 | jl_d5e17d142783f070 | 1 |
2 | jl_32062be6a68c7531 | 1 |
3 | jl_5cacf17ffc563847 | 1 |
4 | jl_81117a845679ae80 | 1 |
5 | jl_174d886f23f3d05e | 1 |
6 | jl_0bb820bd8ae6e87b | 1 |
7 | jl_7896e61d3e45dfb4 | 1 |
8 | jl_62948e9c407ca034 | 1 |
9 | jl_5f30454eae8a42bd | 1 |
10 | jl_65d62034685dd0ed | 1 |
11 | jl_04a882314da504d4 | 1 |
12 | jl_6c9d0349b46d0aae | 1 |
13 | jl_10a1355277be089a | 1 |
14 | jl_ef56b76c21a4dd91 | 1 |
15 | jl_6059a43bec56d8f4 | 1 |
16 | jl_98bed206eb115e04 | 1 |
17 | jl_44da2bd2b0b7e145 | 1 |
18 | jl_9315b3a96ad20ac0 | 1 |
19 | jl_e50079f12b84b9b6 | 1 |
20 | jl_2370348e80d8c420 | 1 |
21 | jl_765d5d6d1e3c30af | 1 |
22 | jl_770ee63428ff8a22 | 1 |
23 | jl_0ea00cbc973dc761 | 1 |
24 | jl_c888c568c29a71b7 | 1 |
25 | jl_3b58a2212af3ba86 | 1 |
26 | jl_216284a90c10e500 | 1 |
27 | jl_151c68314992c211 | 1 |
28 | jl_117c5543fad4be7e | 1 |
29 | jl_5dcae32cb98bfb9b | 1 |
... | ... | ... |
488 | jl_ee99d2f1d59e54e5 | 1 |
489 | jl_c8f54d2003b4e465 | 1 |
490 | jl_689430243e6a7bea | 1 |
491 | jl_5002db1bbda7ff3b | 1 |
492 | jl_100daa4903ccb872 | 1 |
493 | jl_ec512dddb179aae9 | 1 |
494 | jl_8a3e91781bf16ddb | 1 |
495 | jl_2471538c70b44aaf | 1 |
496 | jl_506ead9021c7fd81 | 1 |
497 | jl_d4a605cf31c4d6fa | 1 |
498 | jl_f91843d934a9d80f | 1 |
499 | jl_7ca7ec20e8a87bef | 1 |
503 | jl_9b476549089a6ff7 | 1 |
504 | jl_9ffa35a19d0fab79 | 1 |
505 | jl_bddee21cf5be8648 | 1 |
507 | jl_29c0202cfc89d948 | 1 |
509 | jl_56a526f9757fc61b | 1 |
510 | jl_d6baff3e82b1db8f | 1 |
511 | jl_fc1f842786c79ede | 1 |
512 | jl_6f1969f9bde9a93f | 1 |
514 | jl_7ff9a220edc5f335 | 1 |
515 | jl_1960eac165fffe54 | 1 |
516 | jl_e82d2fa409c92914 | 1 |
517 | jl_849a440bd44fd715 | 1 |
518 | jl_1130793f98655196 | 1 |
519 | jl_a742c28826776c08 | 1 |
520 | jl_7baab1e1b0e8e234 | 1 |
521 | jl_7e2dba1124b29dcf | 1 |
522 | jl_57d36d880dc90113 | 1 |
523 | jl_85017d6b3a30bdd3 | 1 |
501 rows × 2 columns