Here’s a bit of code from a class project.

In about two months, everyone here will be searching job boards for the next step in our careers. For this project, we set out to predict data science salaries in Boston.

We scraped Indeed.com for features such as job title, years of experience, company, and keywords like Python. Only a few listings exposed salary information or full descriptions we could search directly, so we gathered most of our data by manipulating the search query itself and trusting Indeed's backend to return relevant results.
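To make "manipulating the search query" concrete, here is a minimal sketch of how a keyword list folds into the search URL. The use of urlencode is our illustration; the scraper below hand-encodes the OR list with %20 instead of +, which should be equivalent.

from urllib.parse import urlencode

# A hypothetical sketch: fold the keywords into the q parameter so that
# Indeed's own search engine does the keyword matching for us.
keywords = "PHD ph.d".split(" ")
params = {"q": "data scientist ({})".format(" OR ".join(keywords)),
          "l": "Boston"}
print("http://www.indeed.com/jobs?" + urlencode(params))
# -> http://www.indeed.com/jobs?q=data+scientist+%28PHD+OR+ph.d%29&l=Boston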

from urllib.request import urlopen

from bs4 import BeautifulSoup
import pandas as pd


def count_results(query_string=None):
    """Collect Indeed job IDs for "data scientist" postings in Boston
    that match any of the space-separated keywords in query_string.

    query_string format: "keyword keyword"
    """
    if query_string is None:
        print("No keyword entered.")
        return None

    # Join the keywords with a URL-encoded " OR " so Indeed treats them
    # as alternatives, e.g. "PHD ph.d" -> "PHD%20OR%20ph.d".
    query = "%20OR%20".join(query_string.split(" "))

    result_list = []

    # First request: find the total number of results.
    URL_for_count = "http://www.indeed.com/jobs?q=data+scientist+%28{}%29&l=Boston".format(query)
    soup_for_count = BeautifulSoup(urlopen(URL_for_count).read(), "html.parser")

    # The "searchCount" div reads like "Jobs 1 to 10 of 1,234"; the last
    # token is the total, once the thousands separator is stripped.
    results_number = soup_for_count.find("div", attrs={"id": "searchCount"}).text
    number_of_results = int(results_number.split(" ")[-1].replace(",", ""))

    # Now loop through the pages. Viewing 100 results at a time means fewer page refreshes.
    for page_number in range(number_of_results // 100 + 1):
        URL_for_results = ("http://www.indeed.com/jobs?q=data+scientist+%28{}%29"
                           "&l=Boston&limit=100&start={}").format(query, 100 * page_number)
        soup_for_results = BeautifulSoup(urlopen(URL_for_results).read(), "html.parser")
        results = soup_for_results.find_all("div", attrs={"data-tn-component": "organicJob"})

        # Extract the ID for each job listing, paired with a 1 so the
        # column can act as a match indicator when searches are merged.
        for x in results:
            result_list.append([x.find("h2", attrs={"class": "jobtitle"})["id"], 1])

    # Build the DataFrame once, after the loop; appending result_list on
    # every pass would re-add earlier pages each time (and DataFrame.append
    # is removed in pandas 2.x anyway).
    job_ids = pd.DataFrame(result_list,
                           columns=["id", " OR ".join(query_string.split(" "))])

    # Remove re-posted jobs
    job_ids.drop_duplicates(inplace=True)
    # job_ids.to_csv(path_or_buf="id_and_{}.csv".format(query))
    return job_ids
count_results()
No keyword entered.

count_results("PHD ph.d")
                     id  PHD OR ph.d
0 jl_033222419605ed3f 1
1 jl_d5e17d142783f070 1
2 jl_32062be6a68c7531 1
3 jl_5cacf17ffc563847 1
4 jl_81117a845679ae80 1
5 jl_174d886f23f3d05e 1
6 jl_0bb820bd8ae6e87b 1
7 jl_7896e61d3e45dfb4 1
8 jl_62948e9c407ca034 1
9 jl_5f30454eae8a42bd 1
10 jl_65d62034685dd0ed 1
11 jl_04a882314da504d4 1
12 jl_6c9d0349b46d0aae 1
13 jl_10a1355277be089a 1
14 jl_ef56b76c21a4dd91 1
15 jl_6059a43bec56d8f4 1
16 jl_98bed206eb115e04 1
17 jl_44da2bd2b0b7e145 1
18 jl_9315b3a96ad20ac0 1
19 jl_e50079f12b84b9b6 1
20 jl_2370348e80d8c420 1
21 jl_765d5d6d1e3c30af 1
22 jl_770ee63428ff8a22 1
23 jl_0ea00cbc973dc761 1
24 jl_c888c568c29a71b7 1
25 jl_3b58a2212af3ba86 1
26 jl_216284a90c10e500 1
27 jl_151c68314992c211 1
28 jl_117c5543fad4be7e 1
29 jl_5dcae32cb98bfb9b 1
... ... ...
488 jl_ee99d2f1d59e54e5 1
489 jl_c8f54d2003b4e465 1
490 jl_689430243e6a7bea 1
491 jl_5002db1bbda7ff3b 1
492 jl_100daa4903ccb872 1
493 jl_ec512dddb179aae9 1
494 jl_8a3e91781bf16ddb 1
495 jl_2471538c70b44aaf 1
496 jl_506ead9021c7fd81 1
497 jl_d4a605cf31c4d6fa 1
498 jl_f91843d934a9d80f 1
499 jl_7ca7ec20e8a87bef 1
503 jl_9b476549089a6ff7 1
504 jl_9ffa35a19d0fab79 1
505 jl_bddee21cf5be8648 1
507 jl_29c0202cfc89d948 1
509 jl_56a526f9757fc61b 1
510 jl_d6baff3e82b1db8f 1
511 jl_fc1f842786c79ede 1
512 jl_6f1969f9bde9a93f 1
514 jl_7ff9a220edc5f335 1
515 jl_1960eac165fffe54 1
516 jl_e82d2fa409c92914 1
517 jl_849a440bd44fd715 1
518 jl_1130793f98655196 1
519 jl_a742c28826776c08 1
520 jl_7baab1e1b0e8e234 1
521 jl_7e2dba1124b29dcf 1
522 jl_57d36d880dc90113 1
523 jl_85017d6b3a30bdd3 1

501 rows × 2 columns
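The column of 1s next to each job ID exists so that separate keyword searches can later be merged into a single feature matrix, with each keyword column acting as a match indicator. A minimal sketch of that merge, assuming a second call for a different keyword pair (the merge step is our illustration, not project code):

ids_phd = count_results("PHD ph.d")
ids_python = count_results("Python python")

# An outer merge on the job ID keeps every listing seen by either search;
# a 0 after fillna means that keyword never matched the listing.
features = ids_phd.merge(ids_python, on="id", how="outer").fillna(0)
print(features.head())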