Source code for fly_tracker.Scraper

"""
Scrapes and returns a csv file with airline price data
Returns:
    pd.DataFrame: Airfare price data between src and dest
"""
import re
import datetime
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup

class PriceScraper():
    """
    Scraper class that scrapes google flights

    Attributes are set verbatim from the constructor arguments:
        src: origin name used to build the search URL
        dest: destination name used to build the search URL
        price: upper price bound; flights priced above it are dropped
        date: text typed into the date input of the search page
    """

    # First numeric run in a price label, e.g. "1,234" in "$1,234".
    # NOTE(review): assumes "," is a thousands separator and "." a decimal
    # point -- confirm against the locale the page is served in.
    _AMOUNT_RE = re.compile(r"\d[\d,]*(?:\.\d+)?")

    def __init__(self, src: str, dest: str, price: int, date: str) -> None:
        self.src = src
        self.dest = dest
        self.price = price
        self.date = date

    @classmethod
    def _parse_amount(cls, value):
        """
        Convert a price label or number into a float

        Args:
            value: a number, or text such as "$1,234"

        Returns:
            float | None: numeric amount, or None when no digits are found
        """
        if isinstance(value, (int, float)):
            return float(value)
        match = cls._AMOUNT_RE.search(str(value))
        if match is None:
            return None
        return float(match.group().replace(",", ""))

    def get_page(self) -> str:
        """
        Load dynamic chrome browser and return page source to scrape

        Returns:
            str: page source after the date has been entered
        """
        self.preprocess()

        # Open a headless chrome browser
        options = Options()
        options.add_argument('--window-size=1920,1200')
        options.add_argument("--headless")
        driver = webdriver.Chrome(options=options)

        # Always shut the browser down, even when an element lookup fails;
        # otherwise every call leaves a Chrome process behind.
        try:
            url = f'https://www.google.com/travel/flights/non-stop-flights-from-{self.src}-to-{self.dest}.html'
            driver.get(url)

            driver.find_element(By.XPATH, '//*[@class="RLVa8 GeHXyb"]').click()

            # presumably the trip-type selector (going by the variable name)
            # -- the class names are obfuscated, verify against the live page
            trip_type = driver.find_element(
                By.XPATH, '//div[contains(@class, "yRXJAe iWO5td")]'
            )
            # NOTE(review): these XPaths start with "//", so they search the
            # whole document rather than only the children of `trip_type`.
            # Kept as in the original; ".//" would scope them to the element.
            trip_type.find_element(By.XPATH, '//*[@class="Akxp3 Lxea9c"]').find_element(
                By.XPATH, '//*[@class="uT1UOd"]').click()

            date_box = driver.find_element(
                By.XPATH, '//*[contains(@class, "eoY5cb j0Ppje")]')
            time.sleep(2)

            # Clear the input through JavaScript, then type the wanted date
            driver.execute_script("arguments[0].value=''", date_box)
            date_box.send_keys(self.date)
            date_box.send_keys(Keys.ENTER)

            # presumably gives the results time to re-render -- TODO confirm
            time.sleep(2)
            return driver.page_source
        finally:
            driver.quit()

    def soupify(self, page: str) -> "BeautifulSoup":
        """
        Return page to scrape as a BeautifulSoup object

        Args:
            page (str): Source page

        Returns:
            BeautifulSoup: parsed bs4 object
        """
        soup = BeautifulSoup(page, features="html.parser")
        return soup

    def parser(self, soup: "BeautifulSoup") -> "list[dict]":
        """
        Helper parser function that scrapes required details

        The scraped price is text, so it is converted to a number before
        being compared with ``self.price``. Flights whose price text holds
        no number are skipped. If ``self.price`` itself cannot be read as
        a number, no price filtering is applied.

        Args:
            soup (BeautifulSoup): Soup object to scrape

        Returns:
            list[dict]: list of dictionaries which store scraped flight
                information records

        Raises:
            AttributeError: if a flight card lacks one of the expected
                elements (the lookup returns None)
        """
        budget = self._parse_amount(self.price)
        price_class = re.compile('YMlIz FpEdX')  # built once, not per flight

        flights = soup.find_all(class_='pIav2d')
        data = []
        for flight in flights:
            dep_time = flight.find(class_='wtdjmc YMlIz ogfYpf tPgKwe').text
            dep_city = flight.find(class_='G2WY5c sSHqwe ogfYpf tPgKwe').text
            arr_time = flight.find(class_='XWcVob YMlIz ogfYpf tPgKwe').text
            arr_city = flight.find(class_='c8rWCd sSHqwe ogfYpf tPgKwe').text
            price = flight.find(class_=price_class).text

            amount = self._parse_amount(price)
            if amount is None:
                # No readable number: cannot tell whether it is in budget
                continue
            if budget is not None and amount > budget:
                continue

            airline = flight.find(class_='h1fkLb').span.text
            timestamp = datetime.datetime.now()
            info = {
                "Source": dep_city,
                "Departure Time": dep_time,
                "Destination": arr_city,
                "Arrival Time": arr_time,
                "Date": self.date,
                "Price": price,
                "Airline": airline,
                "Timestamp": timestamp
            }
            data.append(info)
        return data

    def create_df(self, data: "list[dict]") -> pd.DataFrame:
        """
        Helper function to convert data into a Pandas Dataframe

        Args:
            data (list[dict]): Flight Information data

        Returns:
            pd.DataFrame: data in the form of a pandas Dataframe
        """
        df = pd.DataFrame(data)
        return df

    def preprocess(self) -> None:
        """
        Helper Preprocessing function

        Replaces spaces in ``src`` and ``dest`` with hyphens so they can be
        embedded in the search URL.
        """
        self.src = self.src.replace(' ', '-')
        self.dest = self.dest.replace(' ', '-')