Web Scraping practice: retrieving job IDs and keywords from Indeed.com

Here’s a bit of code from a class project.

In about two months, everyone in our class will be searching through job boards to find the next step in their careers. For this project, we want to find a way to predict data science salaries in Boston.

We scanned Indeed.com and scraped information related to features such as job title, years of experience, company, and keywords like Python. Only a few listings had salary information or descriptions we could search through. Therefore, we gathered most of our data by varying the search query (for example, adding keywords such as "PhD") and trusting Indeed’s backend processing to deliver relevant results.

from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd

# String format: "keyword keyword"
def count_results(query_string=None):
    """Collect the IDs of Indeed "data scientist" listings in Boston that match any keyword.

    The keywords are OR-ed together inside the search query, the result
    counter on the first page is read to learn how many listings exist, and
    the result pages are then fetched 100 listings at a time.

    Args:
        query_string: Whitespace-separated keywords, e.g. ``"PHD ph.d"``.

    Returns:
        A DataFrame with one row per distinct job ID and two columns: ``id``
        and a column named after the keywords joined by ``" OR "`` whose
        value is always 1. The frame is empty (but still has both columns)
        when no listings are found. Returns ``None`` after printing a
        message when no keyword is supplied (``None``, empty, or blank).

    Raises:
        urllib.error.URLError: If a page cannot be fetched.
        ValueError: If the last token of the result counter is not a number.

    NOTE(review): the selectors (``div#searchCount``, ``data-tn-component``,
    ``h2.jobtitle``) reflect Indeed's markup at the time of writing - confirm
    they still match the live site.
    """
    # Imported locally because the file-level imports only bring in urlopen.
    from urllib.parse import quote

    keywords = query_string.split() if query_string else []
    if not keywords:
        print("No keyword entered.")
        return None

    # Percent-encode each keyword so characters such as "+" or "#" survive in the URL.
    query = "%20OR%20".join(quote(word, safe="") for word in keywords)
    column_name = " OR ".join(keywords)
    search_url = "http://www.indeed.com/jobs?q=data+scientist+%28{}%29&l=Boston".format(query)

    # Find the number of results
    with urlopen(search_url) as response:
        soup_for_count = BeautifulSoup(response.read(), 'html.parser')

    count_tag = soup_for_count.find("div", attrs={"id": "searchCount"})
    # A missing counter is treated as "no results" instead of crashing.
    count_tokens = count_tag.text.split() if count_tag is not None else []
    # The total is taken to be the last token of the counter text.
    number_of_results = int(count_tokens[-1].replace(',', '')) if count_tokens else 0

    # Now loop through the pages. Viewing 100 results at a time means fewer page refreshes.
    page_size = 100
    page_count = -(-number_of_results // page_size)  # ceiling division: no spare empty page
    result_list = []
    for page_number in range(page_count):
        page_url = "{}&limit={}&start={}".format(search_url, page_size, page_size * page_number)
        with urlopen(page_url) as response:
            soup_for_results = BeautifulSoup(response.read(), 'html.parser')

        # Extract the ID for each job listing, skipping cards without a titled heading
        for listing in soup_for_results.find_all('div', attrs={'data-tn-component': 'organicJob'}):
            title = listing.find('h2', attrs={"class": "jobtitle"})
            job_id = title.get('id') if title is not None else None
            if job_id is not None:
                result_list.append([job_id, 1])

    # Build the frame once, after all pages are read; naming the columns here
    # also keeps an empty result valid.
    job_ids = pd.DataFrame(result_list, columns=['id', column_name])

    # Remove re-posted jobs
    job_ids.drop_duplicates(inplace=True)
    return job_ids
    #job_ids.to_csv(path_or_buf="id_and_{}.csv".format(query))

count_results()

No keyword entered.

count_results("PHD ph.d")

	id	PHD OR ph.d
0	jl_033222419605ed3f	1
1	jl_d5e17d142783f070	1
2	jl_32062be6a68c7531	1
3	jl_5cacf17ffc563847	1
4	jl_81117a845679ae80	1
5	jl_174d886f23f3d05e	1
6	jl_0bb820bd8ae6e87b	1
7	jl_7896e61d3e45dfb4	1
8	jl_62948e9c407ca034	1
9	jl_5f30454eae8a42bd	1
10	jl_65d62034685dd0ed	1
11	jl_04a882314da504d4	1
12	jl_6c9d0349b46d0aae	1
13	jl_10a1355277be089a	1
14	jl_ef56b76c21a4dd91	1
15	jl_6059a43bec56d8f4	1
16	jl_98bed206eb115e04	1
17	jl_44da2bd2b0b7e145	1
18	jl_9315b3a96ad20ac0	1
19	jl_e50079f12b84b9b6	1
20	jl_2370348e80d8c420	1
21	jl_765d5d6d1e3c30af	1
22	jl_770ee63428ff8a22	1
23	jl_0ea00cbc973dc761	1
24	jl_c888c568c29a71b7	1
25	jl_3b58a2212af3ba86	1
26	jl_216284a90c10e500	1
27	jl_151c68314992c211	1
28	jl_117c5543fad4be7e	1
29	jl_5dcae32cb98bfb9b	1
...	...	...
488	jl_ee99d2f1d59e54e5	1
489	jl_c8f54d2003b4e465	1
490	jl_689430243e6a7bea	1
491	jl_5002db1bbda7ff3b	1
492	jl_100daa4903ccb872	1
493	jl_ec512dddb179aae9	1
494	jl_8a3e91781bf16ddb	1
495	jl_2471538c70b44aaf	1
496	jl_506ead9021c7fd81	1
497	jl_d4a605cf31c4d6fa	1
498	jl_f91843d934a9d80f	1
499	jl_7ca7ec20e8a87bef	1
503	jl_9b476549089a6ff7	1
504	jl_9ffa35a19d0fab79	1
505	jl_bddee21cf5be8648	1
507	jl_29c0202cfc89d948	1
509	jl_56a526f9757fc61b	1
510	jl_d6baff3e82b1db8f	1
511	jl_fc1f842786c79ede	1
512	jl_6f1969f9bde9a93f	1
514	jl_7ff9a220edc5f335	1
515	jl_1960eac165fffe54	1
516	jl_e82d2fa409c92914	1
517	jl_849a440bd44fd715	1
518	jl_1130793f98655196	1
519	jl_a742c28826776c08	1
520	jl_7baab1e1b0e8e234	1
521	jl_7e2dba1124b29dcf	1
522	jl_57d36d880dc90113	1
523	jl_85017d6b3a30bdd3	1

501 rows × 2 columns

Share on

Twitter Facebook Google+ LinkedIn

Web Scraping practice: retrieving job IDs and keywords from Indeed.com

Nathan Mitchell

Share on