English > Development

Integrating Selenium to PVD

<< < (4/4)

afrocuban:
Here's a Selenium script that expands "See more"-style buttons and captures the whole content of the page, both static and dynamic:


--- Quote ---
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import time
import os

# Where ChromeDriver and Chrome live, and where the captured page is written
CHROME_DRIVER_PATH = r"C:\ChromeDriver-win64\chromedriver.exe"
CHROME_BINARY_PATH = r"C:\PATH_TO\chrome.exe"
SAVE_PATH = r"C:\PersonalVideoDB\Scripts\Tmp\UTF8_NO_BOM-Awards.mhtml"

# Page to capture
IMDB_URL = "https://www.imdb.com/name/nm0190859/awards/"

# Browser configuration: point Selenium at the Chrome binary above and pass
# the launch flags in order ("--disable-gpu" is there for stability).
chrome_options = webdriver.ChromeOptions()
chrome_options.binary_location = CHROME_BINARY_PATH
for launch_flag in ("--no-sandbox", "--disable-dev-shm-usage", "--disable-gpu"):
    chrome_options.add_argument(launch_flag)
# This version runs with a visible browser window; the headless flag and the
# custom user agent are left disabled:
# chrome_options.add_argument("--headless")
# chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")

# Start Chrome through the ChromeDriver service
service = Service(executable_path=CHROME_DRIVER_PATH)
driver = webdriver.Chrome(service=service, options=chrome_options)

# Open the IMDb page, then pause a fixed three seconds so it can finish loading
driver.get(IMDB_URL)
time.sleep(3)

# Function to find and click each "See More" button one by one
def click_see_more_buttons(max_passes=50):
    """Click every "See More" button on the page loaded in the global ``driver``.

    The page is searched again after each pass over the buttons, and the
    search/click cycle repeats until one of the following happens:

    * no "See More" button is found any more,
    * a whole pass ends without a single successful click (the remaining
      buttons cannot be clicked, so retrying would never terminate), or
    * ``max_passes`` passes have been made.

    Args:
        max_passes: Upper bound on the number of search-and-click passes.

    Errors raised while scrolling to or clicking an individual button are
    printed and do not abort the pass.
    """
    see_more_xpath = "//span[@class='ipc-btn__text']//span[contains(@class, 'ipc-see-more__text')]"
    for _ in range(max_passes):
        buttons = driver.find_elements(By.XPATH, see_more_xpath)
        if not buttons:
            break
        clicked = 0
        for button in buttons:
            try:
                # Scroll to the button so it is visible before clicking
                driver.execute_script("arguments[0].scrollIntoView(true);", button)
                time.sleep(1)
                button.click()
                clicked += 1
                print("Clicked a 'See More' button.")
                time.sleep(1)  # Short delay to ensure the click is registered
            except Exception as e:
                print(f"Error clicking a 'See More' button: {e}")
        if not clicked:
            # Every button in this pass failed; another pass would find the
            # same buttons and fail the same way.
            break

# Expand the "See More" sections and grab the rendered page. The browser is
# closed in ``finally`` so that a failure in either step does not leave a
# Chrome/ChromeDriver process running.
try:
    click_see_more_buttons()
    page_source = driver.page_source
finally:
    driver.quit()

# Constructing the MHTML content manually
# NOTE(review): the part header declares quoted-printable, but page_source is
# inserted without any encoding - confirm the consumer of this file expects
# the raw text before changing either side.
mhtml_content = f"""MIME-Version: 1.0
Content-Type: multipart/related; boundary="----=_NextPart_000_0000_01D4E1C0.CE6AA5F0"

This document is a Single File Web Page, also known as a Web Archive file.

------=_NextPart_000_0000_01D4E1C0.CE6AA5F0
Content-Location: {IMDB_URL}
Content-Transfer-Encoding: quoted-printable
Content-Type: text/html; charset=UTF-8

{page_source}

------=_NextPart_000_0000_01D4E1C0.CE6AA5F0--
"""

# Write the MHTML data to the specified file path. Leaving the ``with`` block
# closes the file, so no extra wait is needed afterwards.
with open(SAVE_PATH, "w", encoding="utf-8") as out:
    out.write(mhtml_content)

# Confirm file creation
if os.path.exists(SAVE_PATH):
    print(f"Page saved successfully to {SAVE_PATH}")
else:
    print(f"Failed to save the page to {SAVE_PATH}")
--- End quote ---

afrocuban:
And here's the final working headless version, which can be used for all sections. The only further speed-up I can think of is running Selenium in parallel processes (so that the .psf script does not have to wait for each page to be downloaded before parsing it), but I'm not sure how big that improvement would be on the .psf side or anywhere else.





--- Quote ---
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import time
import os


# Where ChromeDriver and Chrome live, and where the captured page is written
CHROME_DRIVER_PATH = r"C:\PATH\TO\chromedriver.exe"
CHROME_BINARY_PATH = r"C:\PATH\TO\chrome.exe"
SAVE_PATH = r"C:\PATH\TO\PersonalVideoDB\Scripts\Tmp\UTF8_NO_BOM-Awards.mhtml"


# Page to capture
IMDB_URL = "https://www.imdb.com/name/nm0190859/awards/"


# Browser configuration: point Selenium at the Chrome binary above and pass
# the launch flags in order. Headless mode is enabled in this version, and a
# desktop Chrome user agent string is sent.
chrome_options = webdriver.ChromeOptions()
chrome_options.binary_location = CHROME_BINARY_PATH
for launch_flag in (
    "--no-sandbox",
    "--disable-dev-shm-usage",
    "--disable-gpu",  # Disable GPU for stability
    "--headless",
    "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
):
    chrome_options.add_argument(launch_flag)


# Start Chrome through the ChromeDriver service
service = Service(executable_path=CHROME_DRIVER_PATH)
driver = webdriver.Chrome(service=service, options=chrome_options)


# The page is opened once before the cookie is added, then opened again
# afterwards.
cookie = dict(name='example_cookie', value='cookie_value')
driver.get(IMDB_URL)
driver.add_cookie(cookie)
driver.get(IMDB_URL)


# Pause a fixed three seconds so the page can finish loading
time.sleep(3)


# Function to find and click each "See More" button one by one
def click_see_more_buttons(max_passes=50):
    """Click every "See More" button on the page loaded in the global ``driver``.

    The page is searched again after each pass over the buttons, and the
    search/click cycle repeats until one of the following happens:

    * no "See More" button is found any more,
    * a whole pass ends without a single successful click (the remaining
      buttons cannot be clicked, so retrying would never terminate), or
    * ``max_passes`` passes have been made.

    Args:
        max_passes: Upper bound on the number of search-and-click passes.

    Errors raised while scrolling to or clicking an individual button are
    printed and do not abort the pass.
    """
    see_more_xpath = "//span[@class='ipc-btn__text']//span[contains(@class, 'ipc-see-more__text')]"
    for _ in range(max_passes):
        buttons = driver.find_elements(By.XPATH, see_more_xpath)
        if not buttons:
            break
        clicked = 0
        for button in buttons:
            try:
                # Scroll to the button so it is visible before clicking
                driver.execute_script("arguments[0].scrollIntoView(true);", button)
                time.sleep(1)
                button.click()
                clicked += 1
                print("Clicked a 'See More' button.")
                time.sleep(1)  # Short delay to ensure the click is registered
            except Exception as e:
                print(f"Error clicking a 'See More' button: {e}")
        if not clicked:
            # Every button in this pass failed; another pass would find the
            # same buttons and fail the same way.
            break


# Expand the "See More" sections and grab the rendered page. The browser is
# closed in ``finally`` so that a failure in either step does not leave a
# Chrome/ChromeDriver process running.
try:
    click_see_more_buttons()
    page_source = driver.page_source
finally:
    driver.quit()


# Constructing the MHTML content manually
# NOTE(review): the part header declares quoted-printable, but page_source is
# inserted without any encoding - confirm the consumer of this file expects
# the raw text before changing either side.
mhtml_content = f"""MIME-Version: 1.0
Content-Type: multipart/related; boundary="----=_NextPart_000_0000_01D4E1C0.CE6AA5F0"


This document is a Single File Web Page, also known as a Web Archive file.


------=_NextPart_000_0000_01D4E1C0.CE6AA5F0
Content-Location: {IMDB_URL}
Content-Transfer-Encoding: quoted-printable
Content-Type: text/html; charset=UTF-8


{page_source}


------=_NextPart_000_0000_01D4E1C0.CE6AA5F0--
"""


# Write the MHTML data to the specified file path. Leaving the ``with`` block
# closes the file, so no extra wait is needed afterwards.
with open(SAVE_PATH, "w", encoding="utf-8") as out:
    out.write(mhtml_content)


# Confirm file creation
if os.path.exists(SAVE_PATH):
    print(f"Page saved successfully to {SAVE_PATH}")
else:
    print(f"Failed to save the page to {SAVE_PATH}")

--- End quote ---

afrocuban:
And here is the integrated Selenium script that downloads all the pages needed for parsing in parallel. The Genres page has to be saved as plain HTML, because the content that needs to be parsed is not accessible in the MHTML version.




--- Quote ---
import threading
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import os


# Where ChromeDriver and the Chrome binary live
CHROME_DRIVER_PATH = r"Q:\Portableapps\ChromeDriver-win64\chromedriver.exe"
CHROME_BINARY_PATH = r"Q:\Portableapps\GoogleChromePortable64\App\Chrome-bin\chrome.exe"


# Every IMDb page to download, mapped to the file it is saved to.
# The genres search page is the only one with an .html target.
URLS_AND_PATHS = {
    "https://www.imdb.com/name/nm0190859/awards/": r"Q:\Portableapps\PersonalVideoDB\Scripts\Tmp\UTF8_NO_BOM-Awards.mhtml",
    "https://www.imdb.com/name/nm0190859/": r"Q:\Portableapps\PersonalVideoDB\Scripts\Tmp\UTF8_NO_BOM-Main.mhtml",
    "https://www.imdb.com/name/nm0190859/bio/": r"Q:\Portableapps\PersonalVideoDB\Scripts\Tmp\UTF8_NO_BOM-Bio.mhtml",
    "https://www.imdb.com/search/title/?explore=genres&role=nm0190859": r"Q:\Portableapps\PersonalVideoDB\Scripts\Tmp\UTF8_NO_BOM-Genres.html",
    "https://www.imdb.com/name/nm0190859/?showAllCredits=true": r"Q:\Portableapps\PersonalVideoDB\Scripts\Tmp\UTF8_NO_BOM-Credit.mhtml",
}


# Browser configuration shared by all downloads: use the Chrome binary above,
# run headless, and send a desktop Chrome user agent string.
chrome_options = webdriver.ChromeOptions()
chrome_options.binary_location = CHROME_BINARY_PATH
for launch_flag in (
    "--no-sandbox",
    "--disable-dev-shm-usage",
    "--disable-gpu",  # Disable GPU for headless mode stability
    "--headless",
    "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
):
    chrome_options.add_argument(launch_flag)


# Function to download a page
def download_page(IMDB_URL, SAVE_PATH):
    """Fetch one IMDb page with Chrome and save it to ``SAVE_PATH``.

    Each call starts its own WebDriver, sets a cookie on the IMDb base URL,
    opens ``IMDB_URL`` and waits up to five seconds for a
    ``span.ipc-title__text`` element. A timeout is printed, not raised. For
    URLs containing "awards" the "See More" buttons are expanded first.

    The page is written as plain HTML when the URL is a genres search
    (``explore=genres``) or ``SAVE_PATH`` ends in ``.html``/``.htm``;
    otherwise it is wrapped in a hand-built MHTML envelope.

    Args:
        IMDB_URL: Address of the page to download.
        SAVE_PATH: File the page is written to, encoded as UTF-8.

    Raises:
        Whatever the WebDriver or the file write raises; the browser is
        closed before the exception propagates.
    """
    service = Service(executable_path=CHROME_DRIVER_PATH)
    driver = webdriver.Chrome(service=service, options=chrome_options)

    # Everything that talks to the browser sits inside try/finally so the
    # Chrome process is shut down even when a step fails.
    try:
        # Open the base URL first so the cookie can be set for the domain
        driver.get("https://www.imdb.com")
        cookie = {'name': 'example_cookie', 'value': 'example_value', 'domain': 'imdb.com'}
        driver.add_cookie(cookie)

        driver.get(IMDB_URL)

        # Wait for a title element as a sign that the content has loaded
        try:
            WebDriverWait(driver, 5).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "span.ipc-title__text"))
            )
        except Exception as e:
            print(f"Error waiting for the page to load: {e}")

        # Special handling for the awards page
        if "awards" in IMDB_URL:
            _click_see_more_buttons(driver)

        page_source = driver.page_source
    finally:
        driver.quit()

    # Pick the output format from the request itself instead of comparing
    # against one hard-coded person's genres URL.
    save_as_html = (
        "explore=genres" in IMDB_URL
        or SAVE_PATH.lower().endswith((".html", ".htm"))
    )
    if save_as_html:
        content, label = page_source, "HTML"
    else:
        content, label = _build_mhtml(IMDB_URL, page_source), "MHTML"

    # Leaving the ``with`` block closes the file, so no extra wait is needed
    with open(SAVE_PATH, "w", encoding="utf-8") as out:
        out.write(content)
    print(f"Page saved as {label} to {SAVE_PATH}")

    # Confirm file creation
    if os.path.exists(SAVE_PATH):
        print(f"Page saved successfully to {SAVE_PATH}")
    else:
        print(f"Failed to save the page to {SAVE_PATH}")


def _click_see_more_buttons(driver, max_passes=50):
    """Click "See More" buttons until none are left or no click succeeds."""
    see_more_xpath = "//span[@class='ipc-btn__text']//span[contains(@class, 'ipc-see-more__text')]"
    for _ in range(max_passes):
        buttons = driver.find_elements(By.XPATH, see_more_xpath)
        if not buttons:
            break
        clicked = 0
        for button in buttons:
            try:
                # Scroll to the button so it is visible before clicking
                driver.execute_script("arguments[0].scrollIntoView(true);", button)
                time.sleep(1)
                button.click()
                clicked += 1
                print("Clicked a 'See More' button.")
                time.sleep(1)  # Short delay to ensure the click is registered
            except Exception as e:
                print(f"Error clicking a 'See More' button: {e}")
        if not clicked:
            # Every button in this pass failed; another pass would find the
            # same buttons and fail the same way.
            break


def _build_mhtml(url, page_source):
    """Wrap ``page_source`` in the single-part MHTML envelope used for PVD."""
    # NOTE(review): the part header declares quoted-printable, but page_source
    # is inserted without any encoding - confirm the consumer of this file
    # expects the raw text before changing either side.
    return f"""MIME-Version: 1.0
Content-Type: multipart/related; boundary="----=_NextPart_000_0000_01D4E1C0.CE6AA5F0"


This document is a Single File Web Page, also known as a Web Archive file.


------=_NextPart_000_0000_01D4E1C0.CE6AA5F0
Content-Location: {url}
Content-Transfer-Encoding: quoted-printable
Content-Type: text/html; charset=UTF-8


{page_source}


------=_NextPart_000_0000_01D4E1C0.CE6AA5F0--
"""


# Create and start one thread per URL. Each thread runs download_page through
# a small wrapper that records failures, so the final message reflects what
# actually happened instead of always reporting success.
failures = []


def _download_and_record(url, save_path):
    """Run download_page and remember the URL if it raises."""
    try:
        download_page(url, save_path)
    except Exception as e:
        print(f"Failed to download {url}: {e}")
        failures.append(url)


threads = []
for url, save_path in URLS_AND_PATHS.items():
    thread = threading.Thread(target=_download_and_record, args=(url, save_path))
    threads.append(thread)
    thread.start()


# Wait for all threads to complete
for thread in threads:
    thread.join()


if failures:
    print(f"{len(failures)} of {len(threads)} pages could not be saved.")
else:
    print("All pages have been saved successfully.")

--- End quote ---

Next steps:
1. Testing .psf just parsing these pages, for the specific person.
2. Testing passing person url to selenium script, waiting for downloading to finish.
3. Adjusting selenium script to accept parameters (urls, title, year) and to process them.
4. Testing .psf to get back results and parse them, format them and populate them to PVD.

Ivek23:
This is out of the question for me because I only use Firefox and geckodriver.

afrocuban:
Well, I don't actually use any major browser at all; I'm too security-conscious. If it weren't for PVD, I wouldn't use Windows at all. That's why I have my Windows virtual machine - only to be able to run PVD. But I don't care whether it's Firefox or Chrome in this case, because I won't be using them myself anyway. Selenium will, and neither I nor anyone else will notice. Unless there is a reason I'm not aware of, in which case I apologize in advance!

Navigation

[0] Message Index

[*] Previous page

Go to full version