Source code for fly_tracker.Scraper

"""
Scrapes and returns a csv file with airline price data
Returns:
    pd.DataFrame: Airfare price data between src and dest
"""
import re
import datetime
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup

class PriceScraper():
    """
    Scraper class that scrapes google flights

    Attributes:
        src (str): Origin used in the flights URL; ``preprocess`` replaces
            its spaces with hyphens.
        dest (str): Destination used in the flights URL; ``preprocess``
            replaces its spaces with hyphens.
        price (int): Upper price limit; ``parser`` drops flights priced
            above it.
        date (str): Date typed into the page's date box and stored in every
            scraped record.
    """

    # First number in a price label once thousands separators and spaces
    # have been stripped, e.g. "$1,234" -> "1234".
    _PRICE_NUMBER = re.compile(r"\d+(?:\.\d+)?")

    def __init__(self, src: str, dest: str, price: int, date: str) -> None:
        self.src = src
        self.dest = dest
        self.price = price
        self.date = date

    def get_page(self) -> str:
        """
        Load dynamic chrome browser and return page source to scrape

        The browser is always shut down before returning, including when a
        page element cannot be found, so no headless Chrome process is left
        running.

        Returns:
            str: HTML of the results page after the date has been submitted
        """
        self.preprocess()
        # Open a headless chrome browser
        options = Options()
        options.add_argument('--window-size=1920,1200')
        options.add_argument("--headless")
        driver = webdriver.Chrome(options=options)
        try:
            url = f'https://www.google.com/travel/flights/non-stop-flights-from-{self.src}-to-{self.dest}.html'
            driver.get(url)
            # The class names below are generated by the site; presumably
            # these clicks open the trip type selector and pick an option
            # in it -- TODO confirm against the live page.
            driver.find_element(By.XPATH, '//*[@class="RLVa8 GeHXyb"]').click()
            trip_type = driver.find_element(
                By.XPATH, '//div[contains(@class, "yRXJAe iWO5td")]'
            )
            # NOTE(review): both XPaths below start with '//', which searches
            # the whole document rather than only inside the element they are
            # called on ('.//' would scope them). Left unchanged because the
            # page structure cannot be confirmed from this file.
            trip_type.find_element(By.XPATH, '//*[@class="Akxp3 Lxea9c"]').find_element(
                By.XPATH, '//*[@class="uT1UOd"]').click()
            date_box = driver.find_element(
                By.XPATH, '//*[contains(@class, "eoY5cb j0Ppje")]')
            time.sleep(2)

            # Clear any pre-filled value before typing the requested date.
            driver.execute_script("arguments[0].value=''", date_box)
            date_box.send_keys(self.date)
            date_box.send_keys(Keys.ENTER)
            # Fixed wait before the page source is captured.
            time.sleep(2)
            return driver.page_source
        finally:
            # Runs on success and on failure, releasing the browser process.
            driver.quit()

    def soupify(self, page: str) -> "BeautifulSoup":
        """
        Return page to scrape as a BeautifulSoup object

        Args:
            page (str): Source page

        Returns:
            BeautifulSoup: parsed bs4 object
        """
        return BeautifulSoup(page, features="html.parser")

    @staticmethod
    def _parse_price(text: str) -> "float | None":
        """
        Extract the numeric amount from a displayed price label

        Args:
            text (str): Price label as shown on the page, e.g. "$1,234"

        Returns:
            float | None: The amount, or None when the label has no digits
        """
        # Drop thousands separators and any (non-breaking) spaces first so
        # that "1,234" is read as one number instead of "1".
        compact = re.sub(r"[,\s]", "", text)
        found = PriceScraper._PRICE_NUMBER.search(compact)
        return float(found.group()) if found else None

    def parser(self, soup: "BeautifulSoup") -> "list[dict]":
        """
        Helper parser function that scrapes required details

        Flights priced above ``self.price`` are left out, as are flights
        whose price label contains no number (they cannot be checked against
        the limit).

        Args:
            soup (BeautifulSoup): Soup object to scrape

        Returns:
            list[dict]: list of dictionaries which store scraped flight information records
        """
        data = []

        for flight in soup.find_all(class_='pIav2d'):
            dep_time = flight.find(class_='wtdjmc YMlIz ogfYpf tPgKwe').text
            dep_city = flight.find(class_='G2WY5c sSHqwe ogfYpf tPgKwe').text
            arr_time = flight.find(class_='XWcVob YMlIz ogfYpf tPgKwe').text
            arr_city = flight.find(class_='c8rWCd sSHqwe ogfYpf tPgKwe').text
            price = flight.find(class_=re.compile('YMlIz FpEdX')).text
            # The scraped price is text; comparing it directly with the
            # numeric limit raises TypeError, so compare the parsed amount.
            amount = self._parse_price(price)
            if amount is None or amount > self.price:
                continue
            airline = flight.find(class_='h1fkLb').span.text
            data.append({
                "Source": dep_city,
                "Departure Time": dep_time,
                "Destination": arr_city,
                "Arrival Time": arr_time,
                "Date": self.date,
                # Stored exactly as displayed on the page.
                "Price": price,
                "Airline": airline,
                "Timestamp": datetime.datetime.now(),
            })
        return data

    def create_df(self, data: "list[dict]") -> pd.DataFrame:
        """
        Helper function to convert data into a Pandas Dataframe

        Args:
            data (list[dict]): Flight Information data

        Returns:
            pd.DataFrame: data in the form of a pandas Dataframe
        """
        return pd.DataFrame(data)

    def preprocess(self) -> None:
        """
        Helper Preprocessing function

        Replaces spaces in ``src`` and ``dest`` with hyphens, the form used
        in the flights URL built by ``get_page``.
        """
        self.src = self.src.replace(' ', '-')
        self.dest = self.dest.replace(' ', '-')