1
PVD Python Scripts / Re: PVD Selenium MOD v4 IMDb Movie, People and FilmAffinity Scripts
« Last post by afrocuban on January 25, 2026, 08:25:04 pm »
I have tried in my environment with your
time.sleep(random.uniform(8, 12))
and got 02:56.68
And tried with another approach by starting 4 instances of chrome manually and with:
I got 02:03.27 for the same movie Carrie, so my assumption that it would last a minute or so longer is basically correct, at least in my environment. If you could try too, it would be good to compare.
time.sleep(random.uniform(8, 12))
and got 02:56.68
And tried with another approach by starting 4 instances of chrome manually and with:
Quote
# Function to download a page and handle "See more" clicks for specific pages
def download_page(download_url, output_path, port, retries=3):
    """Download one page through a Chrome instance already running on `port`.

    Attaches to the existing Chrome remote-debugging session, loads
    `download_url`, expands every "See all" / "X more" section on known IMDb
    sub-pages, and saves the resulting HTML to `output_path`. A sibling marker
    file (`<output_path stem>_status.txt`) is ALWAYS written so the caller can
    check the outcome: SUCCESS, FAILED, or NOT_FOUND.

    Args:
        download_url: Page URL to fetch.
        output_path: Destination file for the saved HTML.
        port: Chrome remote-debugging port to attach to (instance started
            manually beforehand).
        retries: Maximum number of attempts before giving up (default 3).
    """
    # Random stagger so parallel workers do not hit the site simultaneously.
    time.sleep(random.uniform(1.0, 3.0))
    logging.debug(f"Starting download for URL: {download_url} on port {port}")
    logging.debug(f"Output path: {output_path}")
    marker_file_path = os.path.splitext(output_path)[0] + "_status.txt"  # Define marker file path
    attempt = 0
    success = False

    def click_all_or_more_buttons(driver):
        """
        Clicks every 'See all' or 'X more' button on the page,
        starting from the bottom to handle pages like Movie Connections.
        Waits for content to load after each click.
        """
        while True:
            try:
                # Find all current see-more buttons
                buttons = driver.find_elements(
                    By.XPATH, "//button[contains(@class, 'ipc-see-more__button')]"
                )
                if not buttons:
                    break
                # Reverse order: start from bottom-most button
                buttons = list(reversed(buttons))
                clicked_any = False
                for button in buttons:
                    try:
                        text = button.text.strip().lower()
                        if "all" in text or "more" in text:
                            logging.info(f"Clicking button with text: {text}")
                            driver.execute_script("arguments[0].scrollIntoView(true);", button)
                            time.sleep(0.5)
                            driver.execute_script("arguments[0].click();", button)
                            # Wait until the button becomes stale (DOM updated)
                            WebDriverWait(driver, 10).until(EC.staleness_of(button))
                            clicked_any = True
                            break  # re-find buttons after DOM update
                    except Exception as e:
                        logging.warning(f"Could not click button: {e}")
                if not clicked_any:
                    break
            except Exception:
                break

    try:
        while attempt < retries:
            attempt += 1
            logging.debug(f"Attempt {attempt} for URL: {download_url}")
            # BUGFIX: initialise so the finally below never hits an unbound
            # name when attaching to Chrome fails.
            driver = None
            try:
                # Attach to existing Chrome instance on given port.
                # BUGFIX: attachment is now inside the handled try, so a
                # failed attach is retried instead of aborting all attempts.
                service = Service(chrome_path)
                chrome_options = webdriver.ChromeOptions()
                chrome_options.add_experimental_option("debuggerAddress", f"127.0.0.1:{port}")
                driver = webdriver.Chrome(service=service, options=chrome_options)
                logging.info(f"Attached to Chrome on port {port}")
                # Navigate to the target page
                driver.get(download_url)
                logging.info(f"Page {download_url} loaded successfully.")
                # Wait for the page to load
                WebDriverWait(driver, 3).until(
                    EC.presence_of_element_located((By.TAG_NAME, "body"))
                )
                # Click all "See more" buttons only for specific pages
                if any(keyword in download_url for keyword in [
                    'fullcredits', 'awards', 'keywords', 'releaseinfo', 'plotsummary',
                    'reviews', 'companycredits', 'locations', 'technical',
                    'externalsites', 'movieconnections'
                ]):
                    click_all_or_more_buttons(driver)
                # Retrieve the full source HTML of the page after all "More" buttons are clicked
                html_source = driver.page_source
                logging.debug(f"HTML source length: {len(html_source)}")
                # Detect AWS WAF challenge page before saving
                if "challenge-container" in html_source or "awswaf.com" in html_source:
                    logging.warning(f"IMDb WAF challenge detected for {download_url}. Skipping normal save.")
                    # Append the skipped URL to imdb_skipped_urls.log (one per line)
                    with open("imdb_skipped_urls.log", "a", encoding="utf-8") as skip_log:
                        skip_log.write(download_url + "\n")
                    success = False  # mark as failed so marker file shows FAILED
                else:
                    # Save the HTML using helper (sets success via try/except)
                    try:
                        save_artifacts(driver, output_path)
                        success = True
                        logging.info(f"HTML saved to file: {output_path}")
                    except Exception as e:
                        logging.error(f"Failed to save artifacts for {download_url}: {e}")
                        success = False
                # If successful, set the success flag and break
                if success:
                    break
            except WebDriverException as e:
                logging.error(f"An error occurred while processing {download_url}: {e}")
            except Exception as e:
                logging.error(f"An error occurred: {e}")
            finally:
                if driver is not None:
                    try:
                        # Reset the window instead of closing it
                        driver.get("about:blank")
                        logging.info("Reset Chrome window to blank, still running.")
                    except Exception as e:
                        logging.warning(f"Could not reset window: {e}")
    finally:
        # Always create the marker file, even if everything failed
        with open(marker_file_path, "w") as marker_file:
            if success and os.path.exists(output_path):
                marker_file.write("SUCCESS")
                logging.info(f"Marker file created: {marker_file_path} with status SUCCESS")
            elif not success:
                marker_file.write("FAILED")
                logging.info(f"Marker file created: {marker_file_path} with status FAILED")
            else:
                marker_file.write("NOT_FOUND")
                logging.info(f"Marker file created: {marker_file_path} with status NOT_FOUND")
from concurrent.futures import ThreadPoolExecutor, as_completed

# Define the ports for the Chrome instances you started manually
ports = [9222, 9223, 9224, 9225]


def _marker_reports_success(path):
    """Return True if the marker file at `path` exists and records SUCCESS.

    Uses a context manager so the file handle is closed deterministically
    (the original `open(path).read()` leaked the handle).
    """
    if not os.path.exists(path):
        return False
    with open(path) as marker:
        return "SUCCESS" in marker.read()


# Main script execution - use ThreadPoolExecutor for clean concurrency
try:
    # Adjust max_workers to control how many browser sessions run in parallel
    with ThreadPoolExecutor(max_workers=4) as executor:
        # Submit all download tasks, assigning ports round-robin.
        # BUGFIX: removed the stray "[/i]" forum-markup residue that made
        # this dict comprehension a syntax error.
        futures = {
            executor.submit(download_page, url, save_path, ports[i % len(ports)]): (url, save_path)
            for i, (url, save_path) in enumerate(URLS_AND_PATHS.items())
        }
        # Wait for all tasks to complete
        for future in as_completed(futures):
            url, save_path = futures[future]
            try:
                future.result()  # raises exception if download_page failed
                logging.info(f"Download task finished for {url}")
            except Exception as e:
                logging.error(f"Download task failed for {url}: {e}")
    # Build list of marker files for all processed URLs
    marker_files = [os.path.splitext(save_path)[0] + "_status.txt"
                    for save_path in URLS_AND_PATHS.values()]
    # Check markers to decide final status
    if all(_marker_reports_success(path) for path in marker_files):
        logging.info("All pages have been saved successfully.")
    else:
        logging.warning("Some pages failed. Check marker files for details.")
except Exception as e:
    logging.error(f"An error occurred during threading: {e}")
    sys.exit(1)  # Exit the script with an error code

# Exit the script successfully
sys.exit(0)  # Exit successfully
I got 02:03.27 for the same movie Carrie, so my assumption that it would last a minute or so longer is basically correct, at least in my environment. If you could try too, it would be good to compare.
Recent Posts