// Download the URL page.
// Logs the target file and the URL, then launches the external PVdBDownPage.exe
// helper with the URL and the output file path passed as two quoted arguments.
LogMessage(Chr(9) + Chr(9) + ' Download with PVdBDownPage in file:|' + ScriptPath + OutFile + ' the information of:|' + URL + '||');
FileExecute(ScriptPath + 'PVdBDownPage.exe', '"' + URL + '" "' + ScriptPath + OutFile + '"');
This is likely due to the JavaScript that loads the video elements dynamically on the original page. When you download the page using DownloadPage, the JavaScript doesn't execute, so the dynamic content (like embedded trailers) isn't included in the downloaded HTML file.
To work around this, we have a couple of options:
Use a Web Browser Automation Tool: Tools like Selenium can control a web browser to visit the page and let all JavaScript execute before saving the fully rendered page’s HTML. This way, the dynamic content will be included.
Find the Video URL Manually: Check the original HTML of the page in your browser (right-click and “View Page Source” or inspect elements with Developer Tools). Look for the actual URLs of the video files. You can then use those URLs directly in your script.
Using a web browser automation tool like Selenium is a bit more involved and requires additional setup, but it is a more robust solution for handling dynamic content. If you prefer to go this route, I can guide you through the setup process.
Alternatively, manually inspecting the page’s source for video URLs is quicker but less convenient for repeated tasks.
Here's the rest of the conversation:
Invoking Selenium directly from within a PVD script (written in Pascal) is not straightforward because Selenium typically operates via Python, Java, or another high-level language. However, you can use a workaround by invoking an external Python script from your PVD Pascal script.
"""Save the fully rendered FilmAffinity trailers page to a local HTML file.

The page builds its video elements with JavaScript, so the browser is driven
through Selenium and the DOM is saved only after the trailer container exists.
"""
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Replace "xxxx" with the FilmAffinity movie id.
URL = "https://www.filmaffinity.com/en/evideos.php?movie_id=xxxx"
OUTPUT_FILE = "downpage_trailer-UTF8_NO_BOM_FA.htm"

# Setup the WebDriver. The driver path goes through a Service object because
# webdriver.Chrome() no longer accepts an executable_path keyword in Selenium 4.10+.
driver = webdriver.Chrome(service=Service(executable_path='path_to_chromedriver'))
try:
    # Navigate to the URL
    driver.get(URL)
    # Wait for the trailers to load (adjust the waiting condition as needed)
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, "divvid"))
    )
    # Get the fully rendered HTML
    html = driver.page_source
    # Save the HTML to a file
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as file:
        file.write(html)
finally:
    # Close the browser even when the wait times out or the write fails
    driver.quit()
// Log the action
// (The forum text "In this setup:" was fused onto the comment above by the page extraction.)
LogMessage(Chr(9) + Chr(9) + ' Download with Selenium in file:|' + ScriptPath + OutFile + ' the information of:|' + URL + '||');
// Construct the command to run the Python script.
// The script path is wrapped in double quotes so that an installation folder
// containing spaces is still passed to python.exe as a single argument.
FileExecute('python.exe', '"' + ScriptPath + 'selenium_script.py"');
"""selenium_script.py - save a JavaScript-rendered page to a file.

Usage: selenium_script.py <URL> <output_file>
"""
# NOTE: the forum heading "2. Pascal Script (Invoke Selenium Script)" was fused
# onto the first import line by the page extraction; it is not part of the script.
import sys

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Fail with a usage message instead of an IndexError when arguments are missing.
if len(sys.argv) < 3:
    print("Usage: selenium_script.py <URL> <output_file>")
    sys.exit(1)

# Get the URL and the output file from the command line arguments
url = sys.argv[1]
output_file = sys.argv[2]

# Setup the WebDriver. The driver path goes through a Service object because
# webdriver.Chrome() no longer accepts an executable_path keyword in Selenium 4.10+.
driver = webdriver.Chrome(service=Service(executable_path='path_to_chromedriver'))
try:
    # Navigate to the URL
    driver.get(url)
    # Wait for the trailers to load (adjust the waiting condition as needed)
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, "divvid"))
    )
    # Get the fully rendered HTML
    html = driver.page_source
    # Save the HTML to a file
    with open(output_file, 'w', encoding='utf-8') as file:
        file.write(html)
finally:
    # Close the browser even when the wait times out or the write fails
    driver.quit()
// By following this workflow, you can effectively integrate Selenium into your PVD script to handle pages with dynamic content. This way, you can ensure that all necessary content, like embedded trailers, is included in the HTML that your PVD script processes.
// (The paragraph above is forum text that the page extraction fused onto the next comment.)
// Log the action
LogMessage(Chr(9) + Chr(9) + ' Download with Selenium in file:|' + ScriptPath + OutFile + ' the information of:|' + URL + '||');
// Construct the command to run the Python script
// Three quoted arguments are handed to python.exe: the script path, the URL and the output file path.
FileExecute('python.exe', '"' + ScriptPath + 'selenium_script.py" "' + URL + '" "' + ScriptPath + OutFile + '"');
"""selenium_script.py - save a JavaScript-rendered page to a file.

Usage: selenium_script.py <URL> <output_file>
"""
# NOTE: the forum heading "Invoke the Python Script from Your PVD Script:" was
# fused onto the first import line by the page extraction; it is not part of
# the script.
import sys

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Fail with a usage message instead of an IndexError when arguments are missing.
if len(sys.argv) < 3:
    print("Usage: selenium_script.py <URL> <output_file>")
    sys.exit(1)

# Get the URL and output file from the command line arguments
url = sys.argv[1]
output_file = sys.argv[2]

# Setup the WebDriver. The driver path goes through a Service object because
# webdriver.Chrome() no longer accepts an executable_path keyword in Selenium 4.10+.
driver = webdriver.Chrome(service=Service(executable_path='path_to_chromedriver'))
try:
    # Navigate to the URL
    driver.get(url)
    # Wait for the trailers to load (adjust the waiting condition as needed)
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, "divvid"))
    )
    # Get the fully rendered HTML
    html = driver.page_source
    # Save the HTML to a file
    with open(output_file, 'w', encoding='utf-8') as file:
        file.write(html)
finally:
    # Close the browser even when the wait times out or the write fails
    driver.quit()
// How It Works
// (The heading above is forum text that the page extraction fused onto the next comment.)
// Log the action
LogMessage(Chr(9) + Chr(9) + ' Download with Selenium in file:|' + ScriptPath + OutFile + ' the information of:|' + URL + '||');
// Construct the command to run the Python script
// Three quoted arguments are handed to python.exe: the script path, the URL and the output file path.
FileExecute('python.exe', '"' + ScriptPath + 'selenium_script.py" "' + URL + '" "' + ScriptPath + OutFile + '"');
"""Save an IMDb page after expanding its "12 more" section.

Usage: <script> <URL> <output_file>
"""
# NOTE: this script was collapsed onto a single line by the page extraction, and
# the forum heading "2. Invoke the Python Script from Your PVD Script:" was
# fused onto its end; the heading is not part of the script.
import sys

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Fail with a usage message instead of an IndexError when arguments are missing.
if len(sys.argv) < 3:
    print("Usage: <script> <URL> <output_file>")
    sys.exit(1)

# Get the URL and output file from the command line arguments
url = sys.argv[1]
output_file = sys.argv[2]

# Setup the WebDriver. The driver path goes through a Service object because
# webdriver.Chrome() no longer accepts an executable_path keyword in Selenium 4.10+.
driver = webdriver.Chrome(service=Service(executable_path='path_to_chromedriver'))
try:
    # Navigate to the URL
    driver.get(url)
    # Wait for the "12 more" button to be clickable and click it
    more_button = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.XPATH, "//span[contains(text(), '12 more')]"))
    )
    more_button.click()
    # Wait for the new content to load
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "div[data-testid='sub-section-akas']"))
    )
    # Get the fully rendered HTML
    html = driver.page_source
    # Save the HTML to a file
    with open(output_file, 'w', encoding='utf-8') as file:
        file.write(html)
finally:
    # Close the browser even when a wait times out or the write fails
    driver.quit()
// NOTE: this snippet was collapsed onto one line, which put both statements
// behind the leading "//" and turned them into a comment. The forum heading
// "Summary" was also fused onto its end; it is not part of the snippet.
// Log the action
LogMessage(Chr(9) + Chr(9) + ' Download with Selenium in file:|' + ScriptPath + OutFile + ' the information of:|' + URL + '||');
// Construct the command to run the Python script
// Three quoted arguments are handed to python.exe: the script path, the URL and the output file path.
FileExecute('python.exe', '"' + ScriptPath + 'selenium_imdb.py" "' + URL + '" "' + ScriptPath + OutFile + '"');
In the Python attachment, there are fetch_imdbs_titles.py and imdb_aka_fetcher.py scripts to help and see the specific paths of geckodriver for firefox when writing Python scripts.
I would like to ask you to make a Python Script (selenium_script.py) and Pascal Script (Invoke Selenium Script) path to IMDB_[EN][HTTPS]_TEST_2c 2c script for me and add all this in the attachment here.
Where do I need to add geckodriver for firefox.
In the Python attachment, there are fetch_imdbs_titles.py and imdb_aka_fetcher.py scripts to help and see the specific paths of geckodriver for firefox when writing Python scripts.
I would like to ask you to make a Python Script (selenium_script.py) and Pascal Script (Invoke Selenium Script) path to IMDB_[EN][HTTPS]_TEST_2c 2c script for me and add all this in the attachment here.
Where do I need to add geckodriver for firefox.
It isn't clear who you are asking to do this, but in case you are asking me, I am still at the very beginning of even comprehending the concept, let alone coding it. Interacting with AI can be, and is, extremely frustrating, and whatever I tried, I needed to try it live; otherwise I had to start over each time. Meaning, asking me to provide it for you isn't a productive way, unless you too want to go crazy like I did while upgrading the FA script, hahahah. To be able to parse the FA trailers page (meaning to be able to download and parse the dynamic HTML content on FA), I think I'll need a month at least, but I'm not surrendering.
Meanwhile, I started to fix and upgrade IMDb people script. I already fixed "bio" field, but I need and want to further tweak, update and upgrade it before meaningfully post it.
I installed: selenium, beautifulsoup4, node.js and puppeteer. No solution downloaded the dynamic content locally.
We have no choice for now, it looks....
I tried it for People credits only...
I installed: selenium, beautifulsoup4, node.js and puppeteer. No solution downloaded the dynamic content locally.
We have no choice for now, it looks....
I tried it for People credits only...
Here is the url for Aaron Spelling credits
https://www.imdb.com/name/nm0005455/?showAllCredits=true
I already found a solution for AKA titles: how to download them all. They work in test form, but there are still some details missing that need to be tested, as well as selenium_script.py.
IMDB_[EN][HTTPS]_TEST_2c 2c script I had to change some parts of the code so that now Function ParsePage_IMDBMovieAKA is the only one that is used.
"""Download a fully rendered IMDb page with Firefox and save it as UTF-8 HTML.

Usage: selenium_script.py <IMDb URL> [output_file]

When output_file is omitted, the page is saved to the default location inside
the PVD installation folder configured below.
"""
import os
import sys
import time

from selenium import webdriver
from selenium.webdriver.firefox.service import Service

# Check that the IMDb URL was passed as a parameter
if len(sys.argv) < 2:
    print("IMDb URL ni bil posredovan kot parameter.")
    sys.exit(1)
imdb_url = sys.argv[1]  # IMDb URL from the command line

# Path to geckodriver.exe; adjust it to where the driver is installed
gecko_path = "C:/Projects/geckodriver.exe"

# PVD installation folder, used only for the default output location.
# Raw string: the backslashes must not be read as escape sequences.
pvd_path = r"D:\MyPVD\PVD_0.9.9.21_MOD-Simple AllMovies"

# The PVD Pascal script passes the output file as the second argument and then
# polls for exactly that file, so honour it when present.
if len(sys.argv) > 2:
    output_path = os.path.abspath(sys.argv[2])
else:
    output_path = os.path.join(pvd_path, "Scripts", "Tmp", "downpage-UTF8_NO_BOM.htm")

# Create the target folder if it does not exist yet
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Create the browser object
service = Service(gecko_path)
driver = webdriver.Firefox(service=service)
try:
    # Open the IMDb page
    driver.get(imdb_url)
    print(f"Stran {imdb_url} je naložena.")
    # Give the page time to finish loading
    time.sleep(5)
    # Get the complete HTML source of the page
    html_source = driver.page_source
    # Write to a temporary name and rename afterwards, so a caller that polls
    # for the output file never sees a half-written page.
    partial_path = output_path + ".part"
    with open(partial_path, 'w', encoding='utf-8') as file:
        file.write(html_source)
    os.replace(partial_path, output_path)
    print(f"HTML je shranjen v datoteko: {output_path}")
finally:
    # Close the browser
    driver.quit()
Function GetDownloadURL:AnsiString; //BlockOpen
//Decides the next script Mode and sets the global DownloadURL.
//Returns GetAppPath+DUMMY_HTML_FILE on every normal path, or an empty string
//(with Mode:=smFinished) when PVdBDownPage.exe is missing from the Scripts folder.
//In smSearch mode the stored 'url' field is normalized; if it contains BASE_URL_PRE,
//the movie ID is extracted, DownloadURL is built and Mode becomes smNormal.
//Otherwise Mode stays smSearch and DownloadURL is cleared.
Var
curPos:Integer;
ScriptPath,MovieID:String;
Begin
LogMessage('Testna inicializacija log sistema.');
LogMessage('Testno sporočilo: Log deluje.');
LogMessage('Function GetDownloadURL BEGIN======================|');
LogMessage('Global Var-Mode|'+IntToStr(Mode)+'|');
LogMessage('Global Var-DownloadURL|'+DownloadURL+'|');
//Check that the needed external files exist.
//NOTE(review): only PVdBDownPage.exe is checked here, while DownloadPage launches python.exe with selenium_script.py - confirm which file should be required.
ScriptPath:=GetAppPath+'Scripts\';
If Not(FileExists(ScriptPath+'PVdBDownPage.exe')) Then Begin
ShowMessage ('This script needs the external file for work.'+Chr(13)+'• PVdBDownPage.exe'+Chr(13)+'Read script text for futher information',SCRIPT_NAME);
Mode:=smFinished;
Result:=''; //On error return an empty string
exit;
End;
If (Mode=smSearch) Then Begin
//Get the stored URL, if any. StoredURL is not declared in this function (presumably a script-level variable - verify).
StoredURL:=GetFieldValueXML('url');
LogMessage('Stored URL is:'+StoredURL+'||');
//Standardize the URL: lower case, http instead of https, www. host, no httpbin prefix
StoredURL:=LowerCase(StoredURL);
StoredURL:=StringReplace(StoredURL,'https','http',True,True,False);
StoredURL:=StringReplace(StoredURL,'http://imdb.com/', 'http://www.imdb.com/', True,True,False);
StoredURL:=StringReplace(StoredURL,'http://httpbin.org/response-headers?key=','',True,False,False);
StoredURL:=StringReplace(StoredURL,' ',BASE_URL_SUF,True,True,False)+BASE_URL_SUF; //Ensure that every URL ends with BASE_URL_SUF (even the one in the last position)
LogMessage('* Stored URL is:'+StoredURL+'||');
//Get the IMDB ID, if present.
curPos:=Pos(BASE_URL_PRE,StoredURL);
If 0<curPos Then Begin //Get IMDB_ID for search
LogMessage(' IMDB URL.');
MovieID:=TextBetWeen(StoredURL,BASE_URL_PRE,BASE_URL_SUF,false,curPos); //WEB_SPECIFIC
DownloadURL:=BASE_URL_PRE_TRUE+ MovieID +BASE_URL_SUF; //WEB_SPECIFIC
LogMessage(' Parse stored information DownloadURL:'+DownloadURL+' ||');
Mode:=smNormal; //->Go to function ParsePage to parse the film information
Result:=GetAppPath+DUMMY_HTML_FILE; //Any small existing file, used to satisfy PVdB's automatic download.
LogMessage('Function GetDownloadURL END====================== with Mode='+IntToStr(Mode)+' Result='+Result+'|');
exit;
End Else Begin //No movie URL is stored, so search mode is needed. Download the search page.
//ShowMessage('No IMDB URL.',SCRIPT_NAME);
LogMessage(' No IMDB URL.');
Mode:=smSearch; //->Go to function ParsePage to search for the URL (user functions cannot be used in this function)
DownloadURL:=''; //There is no movie URL.
Result:=GetAppPath+DUMMY_HTML_FILE; //Any small existing file, used to satisfy PVdB's automatic download.
LogMessage('Function GetDownloadURL END====================== with Mode='+IntToStr(Mode)+' Result='+Result+'|');
exit; //Continue in ParsePage
End;
End;
//No other mode needs any work in this function.
//smNormal = 1; //This script downloads with an external program (not through GetDownloadURL), so it makes a single pass through ParsePage to retrieve all info, credits, poster, etc.; other field modes aren't necessary
//smSearchList = 8; //Used in ParsePage to request the download of the https link chosen by the user in the AddSearchResult window
Result:=GetAppPath+DUMMY_HTML_FILE; //Any small existing file, used to satisfy PVdB's automatic download.
LogMessage('Function GetDownloadURL END====================== with Mode='+IntToStr(Mode)+' Result='+Result+'|');
exit;
End; //BlockClose
.
.
.
Function DownloadPage(URL:AnsiString):String; //BlockOpen
//Downloads URL through the external Python/Selenium script and returns the page text.
//Returns an empty string when the user cancels after too many waits, or when the
//downloaded text contains one of the known error markers (404, 405, ban notice).
//The text is read from ScriptPath+BASE_DOWNLOAD_FILE_NO_BOM and passed through
//ConvertEncoding(WebText, 65001) before being returned.
Var
  i:Integer;
  ScriptPath,WebText:String;
Begin
  LogMessage(Chr(9)+Chr(9)+'Function DownloadPage BEGIN======================|');
  LogMessage(Chr(9)+Chr(9)+'Global Var-DownloadURL|'+DownloadURL+' |');
  LogMessage(Chr(9)+Chr(9)+' Local Var-URL|'+URL+' |');
  ScriptPath:=GetAppPath+'Scripts\';
  //Delete the previously downloaded page file, so that its presence later means the new download has arrived.
  While FileExists(ScriptPath+BASE_DOWNLOAD_FILE_NO_BOM) Do Begin
    LogMessage(Chr(9)+Chr(9)+'Deleting existing file: ' + ScriptPath + BASE_DOWNLOAD_FILE_NO_BOM);
    FileExecute('cmd.exe', '/C del "'+ScriptPath+BASE_DOWNLOAD_FILE_NO_BOM+'"');
    LogMessage(Chr(9)+Chr(9)+' Waiting 1s for delete:'+ScriptPath+BASE_DOWNLOAD_FILE_NO_BOM);
    wait (1000);
  End;
  // Download the URL page.
  //LogMessage(Chr(9)+Chr(9)+' Download with PVdBDownPage in file:|'+ScriptPath+BASE_DOWNLOAD_FILE_NO_BOM+' the information of:|'+URL+' ||');
  //FileExecute(ScriptPath+'PVdBDownPage.exe', '"'+URL+'" "'+ScriptPath+BASE_DOWNLOAD_FILE_NO_BOM+'"');
  LogMessage(Chr(9) + Chr(9) + ' Download with Selenium in file:| ' + ScriptPath + BASE_DOWNLOAD_FILE_NO_BOM + ' the information of:|' + URL + '||');
  LogMessage(Chr(9)+Chr(9)+'Executing Python script to download URL content.');
  //python.exe receives three quoted arguments: the script path, the URL and the output file path.
  FileExecute('python.exe', '"' + ScriptPath + 'selenium_script.py" "' + URL + '" "' + ScriptPath + BASE_DOWNLOAD_FILE_NO_BOM + '"');
  // Wait until the download has finished and the downloaded page exists.
  i:=0; // counts waits up to INTERNET_TEST_ITERATIONS
  While Not(FileExists(ScriptPath+BASE_DOWNLOAD_FILE_NO_BOM)) Do Begin
    //The message now matches the real 5000 ms wait below (it used to say 2s).
    LogMessage(Chr(9)+Chr(9)+' Waiting 5s for exists of:'+ScriptPath+BASE_DOWNLOAD_FILE_NO_BOM);
    wait (5000);
    i:=i+1;
    If i=INTERNET_TEST_ITERATIONS Then Begin
      //Message box type 5 is compared against 2 here; per the prompt text, 2 is the Cancel answer.
      if 2=MessageBox('Too many faulty attempts to internet connection.'+Chr(13)+ 'Retry or Cancel?',SCRIPT_NAME,5) then begin
        LogMessage(Chr(9)+Chr(9)+'Function DownloadPage END with NOT INTERNET connection ===============|');
        Result:='';
        Exit;
      End;
      i:=0; //Retry: start counting again
    End;
  End;
  LogMessage(Chr(9)+Chr(9)+' Now present complete page file: '+ScriptPath+BASE_DOWNLOAD_FILE_NO_BOM);
  WebText:=FileToString(ScriptPath+BASE_DOWNLOAD_FILE_NO_BOM);
  LogMessage(Chr(9)+Chr(9)+'File content length: ' + IntToStr(Length(WebText)));
  LogMessage(Chr(9)+Chr(9)+'File content (first 100 chars): ' + Copy(WebText, 1, 100));
  WebText:=ConvertEncoding(WebText, 65001); //65001 is the Windows code page number for UTF-8
  Result:=WebText;
  // Some download data validations: any known error marker empties the result.
  if (Pos('404 Not Found',Result)>0) then begin
    If BYPASS_SILENT Then ShowMessage('The URL is not in use (404 Not Found).'+Chr(13)+'Go to the provider web in order to find the good page',SCRIPT_NAME);
    LogMessage(Chr(9)+Chr(9)+' 404 Not Found|');
    Result:='';
  End;
  if (Pos('404 Error - IMDb',Result)>0) then begin
    If BYPASS_SILENT Then ShowMessage('The URL is not in use (404 Error - IMDb).'+Chr(13)+'Go to the provider web in order to find the good page',SCRIPT_NAME);
    LogMessage(Chr(9)+Chr(9)+' 404 Error - IMDb|');
    Result:='';
  End;
  if (Pos('Page not found',Result)>0) then begin
    If BYPASS_SILENT Then ShowMessage('The URL is not in use (Page not found).'+Chr(13)+'Go to the provider web in order to find the good page',SCRIPT_NAME);
    LogMessage(Chr(9)+Chr(9)+' Page not found|');
    Result:='';
  End;
  if (Pos('405 Method not allowed',Result)>0) then begin
    If BYPASS_SILENT Then ShowMessage('The URL has HTTP method problems (405 Method not allowed).'+Chr(13)+'Go to the provider web in order to find the good page',SCRIPT_NAME);
    LogMessage(Chr(9)+Chr(9)+' 405 Method not allowed|');
    Result:='';
  End;
  if (Pos('Too many request',Result)>0) then begin
    If BYPASS_SILENT Then ShowMessage('The provider has banned your IP (Too many request).'+Chr(13)+'Go to the provider web and resolve the captcha in order to prove you are not a robot',SCRIPT_NAME);
    LogMessage(Chr(9)+Chr(9)+' Banned IP|');
    Result:='';
  End;
  LogMessage('Value BASE_DOWNLOAD_FILE_NO_BOM: ' + BASE_DOWNLOAD_FILE_NO_BOM);
  LogMessage(Chr(9)+Chr(9)+'Function DownloadPage END======================|');
  exit;
End; //BlockClose
Function DownloadImage(URL:AnsiString;OutPutFile:AnsiString):Integer; //BlockOpen
//Image download is currently disabled: the original implementation is kept
//below inside a block comment. Because nothing is downloaded, the function
//explicitly returns 0 instead of leaving Result unassigned.
//When enabled, it returns 1 if the downloaded image file exists and 0 otherwise.
//Var
//i:Integer;
//ScriptPath:String;
Begin
  Result:=0; //Nothing is downloaded while the body is disabled
(*
LogMessage(Chr(9)+Chr(9)+'Function DownloadImage BEGIN======================|');
LogMessage(Chr(9)+Chr(9)+'Global Var-DownloadURL|'+DownloadURL+' |');
LogMessage(Chr(9)+Chr(9)+' Local Var-URL|'+URL+' |');
LogMessage(Chr(9)+Chr(9)+' Local Var-OutPutFile|'+OutPutFile+'|');
ScriptPath:=GetAppPath+'Scripts\';
//Delete the ancient dowloaded page file. Needed for wait to curl download included in PowerShell command.
While FileExists(OutPutFile) Do Begin
FileExecute('cmd.exe', '/C del "'+OutPutFile+'"');
LogMessage(Chr(9)+Chr(9)+' Waiting 1s for delete:'+OutPutFile);
wait (1000);
End;
//Download the URL page.
LogMessage(Chr(9)+Chr(9)+' Download with PVdBDownPage in file:|'+OutPutFile+' the information of:|'+URL+' ||');
FileExecute(ScriptPath+'PVdBDownPage.exe', '"'+URL+'" "'+OutPutFile+'"');
//Wait download finish and exist the downloaded page.
i:=0; // INTERNET_TEST_ITERATIONS
While Not(FileExists(OutPutFile)) Do Begin
LogMessage(Chr(9)+Chr(9)+' Waiting 2s for exists of:'+OutPutFile);
wait (2000);
i:=i+1;
If i=INTERNET_TEST_ITERATIONS Then Begin //In the images download the scritp can not ask to the user for internet conexion because perhaps the file doesn't exist.
LogMessage(Chr(9)+Chr(9)+'Function DownloadImage END with NOT file downloaded ===============|');
Result:=0;
exit;
End;
End;
LogMessage(Chr(9)+Chr(9)+' Now present complete page file: '+OutPutFile);
Result:=1;
LogMessage(Chr(9)+Chr(9)+'Function DownloadImage END======================|');
exit;
*)
End; //BlockClose
.
.
.
Function ParsePage(HTML:String;URL:AnsiString):Cardinal; //BlockOpen
Var
MovieID,titleValue,yearValue:String;
ResultTmp:Cardinal;
Date:String;
Fullinfo,Movie_URL,IMDB_URL:String;
DateParts:TWideArray;
Fullinfo1,MovieID1:String;
Begin
.
.
.
//Parse Also Known As provider page = BASE_URL_AKA-------------------------------------------------------------------
If (GET_FULL_AKA and Not(USE_SAVED_PVDCONFIG and (Copy(PVDConfigOptions,opAKA,1)='0'))) Then Begin
//If (GET_FULL_AKA and (MediaType='Movie') and Not(USE_SAVED_PVDCONFIG and (Copy(PVDConfigOptions,opAKA,1)='0'))) Then Begin
//If (GET_FULL_AKA and Not(USE_SAVED_PVDCONFIG and (Copy(PVDConfigOptions,opAKA,1)='0'))) Then Begin
DownloadURL:=StringReplace(BASE_URL_AKA,'%IMDB_ID',MovieID,True,True,False);
HTML:=DownloadPage(DownloadURL); //True page for parsing
//HTML := DownloadPage(DownloadURL, 'Tmp\downpage-UTF8_NO_BOM_AKA.htm'); // True page for parsing
//BASE_DOWNLOAD_FILE_NO_BOM_AKA = 'Tmp\downpage-UTF8_NO_BOM_AKA.htm';
HTML:=HTMLToText(HTML);
ResultTmp:=ParsePage_IMDBMovieAKA(HTML);
If Not(ResultTmp=prFinished) then Result:=ResultTmp;
End;
"""Save the rendered IMDb "all credits" page of one person as an .mhtml file.

The page is loaded in headless Chrome, the script waits for a title element,
and the resulting DOM is wrapped in a minimal hand-built MHTML envelope.
"""
import os

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Paths
CHROME_DRIVER_PATH = r"C:\PersonalVideoDB\Scripts\Tmp\chromedriver.exe"
CHROME_BINARY_PATH = r"C:\GoogleChromePortable64\App\Chrome-bin\chrome.exe"
SAVE_PATH = r"C:\PersonalVideoDB\Scripts\Tmp\downpage-UTF8_NO_BOM-Credit.mhtml"
# IMDb URL
IMDB_URL = "https://www.imdb.com/name/nm0000040/?showAllCredits=true"
# MIME boundary separating the parts of the MHTML document
MHTML_BOUNDARY = "----=_NextPart_000_0000_01D4E1C0.CE6AA5F0"


def build_mhtml(url, html):
    """Return *html* wrapped in a single-part MHTML document for *url*.

    The HTML is stored unencoded, so the part is declared as 8bit (the previous
    quoted-printable label did not match the raw body). Empty lines separate
    each header group from its body, as MIME requires.
    """
    lines = [
        "MIME-Version: 1.0",
        f'Content-Type: multipart/related; boundary="{MHTML_BOUNDARY}"',
        "",
        "This document is a Single File Web Page, also known as a Web Archive file.",
        "",
        f"--{MHTML_BOUNDARY}",
        f"Content-Location: {url}",
        "Content-Transfer-Encoding: 8bit",
        "Content-Type: text/html; charset=UTF-8",
        "",
        html,
        f"--{MHTML_BOUNDARY}--",
        "",
    ]
    return "\n".join(lines)


# Chrome options
chrome_options = webdriver.ChromeOptions()
chrome_options.binary_location = CHROME_BINARY_PATH  # Specify the Chrome binary location
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--disable-gpu")  # Disable GPU for headless mode stability
chrome_options.add_argument("--headless")  # Running Chrome in headless mode
chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
# Service object for ChromeDriver
service = Service(executable_path=CHROME_DRIVER_PATH)
# Initialize the WebDriver
driver = webdriver.Chrome(service=service, options=chrome_options)
try:
    # Add a custom cookie; a page of the domain must be open before a cookie can be set.
    # NOTE(review): the cookie name/value are placeholders - confirm they are needed.
    driver.get("https://www.imdb.com")
    cookie = {'name': 'example_cookie', 'value': 'example_value', 'domain': 'imdb.com'}
    driver.add_cookie(cookie)
    # Navigate to the IMDb page
    driver.get(IMDB_URL)
    # Wait for a specific element; on failure the error is reported and whatever
    # has been rendered so far is still saved.
    try:
        WebDriverWait(driver, 30).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "span.ipc-title__text"))
        )
    except Exception as e:
        print(f"Error waiting for the page to load: {e}")
    # Get page source
    page_source = driver.page_source
finally:
    # Close the browser even when navigation fails
    driver.quit()

# Write the MHTML data to the specified file path. The file is flushed and
# closed when the with-block ends, so no extra sleep is needed afterwards.
with open(SAVE_PATH, "w", encoding="utf-8") as file:
    file.write(build_mhtml(IMDB_URL, page_source))
# Confirm file creation
if os.path.exists(SAVE_PATH):
    print(f"Page saved successfully to {SAVE_PATH}")
else:
    print(f"Failed to save the page to {SAVE_PATH}")
"""Export the credits of one IMDb person page to a CSV file.

Each credit row contributes its title, year and a comma-separated role list.
"""
# NOTE: the forum heading "Key Points:" was fused onto the first import line by
# the page extraction; it is not part of the script.
import csv
import os  # required by the final os.path.exists() check
import time

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Paths (raw strings, so each separator is a single backslash)
CHROME_DRIVER_PATH = r"Q:\ChromeDriver-win64\chromedriver.exe"
CHROME_BINARY_PATH = r"Q:\GoogleChromePath\chrome.exe"
CSV_PATH = r"C:\PersonalVideoDB\Scripts\Tmp\credits.csv"
# IMDb URL
IMDB_URL = "https://www.imdb.com/name/nm0000040/?showAllCredits=true"


def first_text(parent, selector, default="N/A"):
    """Return the text of the first element under *parent* matching *selector*.

    find_elements() is used because find_element() raises when nothing matches,
    which would make the *default* fallback unreachable.
    """
    matches = parent.find_elements(By.CSS_SELECTOR, selector)
    return matches[0].text.strip() if matches else default


# Chrome options
chrome_options = webdriver.ChromeOptions()
chrome_options.binary_location = CHROME_BINARY_PATH
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--headless")
chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
# Service object for ChromeDriver
service = Service(executable_path=CHROME_DRIVER_PATH)
# Initialize the WebDriver
driver = webdriver.Chrome(service=service, options=chrome_options)
data = []
try:
    # Navigate to the specific IMDb page
    driver.get(IMDB_URL)
    # Wait for the credits section to load
    WebDriverWait(driver, 30).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, ".filmo-category-section"))
    )
    # Scroll down, up and down again to trigger loading of all content
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(10)
    driver.execute_script("window.scrollTo(0, 0);")
    time.sleep(5)
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(10)
    # Extract movie titles and roles
    credit_rows = driver.find_elements(By.CSS_SELECTOR, ".filmo-category-section .filmo-row")
    for row in credit_rows:
        title = first_text(row, "b a")
        year = first_text(row, ".year_column")
        # Adjust the selector to match your needs
        role_elements = row.find_elements(By.CSS_SELECTOR, "a[href*='?ref_=nmbio_']")
        roles = [role.text for role in role_elements] if role_elements else ["N/A"]
        data.append([title, year, ", ".join(roles)])
finally:
    # Close the browser even when the wait or the scraping fails
    driver.quit()

# Save the data to a CSV file
with open(CSV_PATH, 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["Title", "Year", "Roles"])
    writer.writerows(data)
# Confirm file creation
if os.path.exists(CSV_PATH):
    print(f"Data saved successfully to {CSV_PATH}")
else:
    print(f"Failed to save the data to {CSV_PATH}")
program CallPython;
// NOTE: the forum heading "Step 2: Python Script to Search and Parse IMDb" was
// fused onto the program line by the page extraction; it is not part of the code.
//
// Launches the Python IMDb script with a title and a year, waits for it to
// finish, and then prints every line of the result file the script writes.

uses
  ShellAPI, SysUtils, Windows;

const
  RESULT_FILE = 'C:\Path\To\Script\results.txt';

var
  Title, Year: string;
  PythonExe, ScriptPath, Parameters: string;
  ExecInfo: TShellExecuteInfo;
  ResultFile: TextFile;
  Line: string;
begin
  Title := 'MovieTitle'; // These would be input parameters in your actual app
  Year := '2021';
  PythonExe := 'C:\Path\To\Python\python.exe';
  ScriptPath := 'C:\Path\To\Script\imdb_script.py';
  Parameters := Format('"%s" "%s" "%s"', [ScriptPath, Title, Year]);

  // ShellExecuteEx with SEE_MASK_NOCLOSEPROCESS returns a process handle that
  // can be waited on. Plain ShellExecute returns as soon as the process has
  // started, so the result file would be read before the script wrote it.
  FillChar(ExecInfo, SizeOf(ExecInfo), 0);
  ExecInfo.cbSize := SizeOf(ExecInfo);
  ExecInfo.fMask := SEE_MASK_NOCLOSEPROCESS;
  ExecInfo.lpVerb := 'open';
  ExecInfo.lpFile := PChar(PythonExe);
  ExecInfo.lpParameters := PChar(Parameters);
  ExecInfo.nShow := SW_HIDE;

  if not ShellExecuteEx(@ExecInfo) then
  begin
    Writeln('Failed to execute Python script');
    Exit;
  end;
  Writeln('Python script executed successfully');

  // Block until the Python process has exited, then release its handle.
  if ExecInfo.hProcess <> 0 then
  begin
    WaitForSingleObject(ExecInfo.hProcess, INFINITE);
    CloseHandle(ExecInfo.hProcess);
  end;

  // The Python script is expected to write its output to the result file.
  if not FileExists(RESULT_FILE) then
  begin
    Writeln('Result file not found: ' + RESULT_FILE);
    Exit;
  end;

  AssignFile(ResultFile, RESULT_FILE);
  Reset(ResultFile);
  try
    while not Eof(ResultFile) do
    begin
      ReadLn(ResultFile, Line);
      Writeln(Line); // Process each line (this would be your actual data processing)
    end;
  finally
    CloseFile(ResultFile); // Close the file even if reading fails
  end;
end.
"""imdb_script.py - look a title up on IMDb and write basic data to a file.

Usage: imdb_script.py <Title> <Year>
"""
# NOTE: the forum heading "Step 3: Process Results in Pascal/Delphi" was fused
# onto the first import line by the page extraction; it is not part of the
# script.
import sys
import time
from urllib.parse import quote_plus

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

CHROME_DRIVER_PATH = "path/to/chromedriver"
IMDB_URL_TEMPLATE = "https://www.imdb.com/find?q={query}&s=tt"
RESULTS_PATH = "path/to/results.txt"


def main():
    """Search IMDb for "<Title> <Year>", open the first hit and save its data."""
    if len(sys.argv) != 3:
        print("Usage: imdb_script.py <Title> <Year>")
        return
    title = sys.argv[1]
    year = sys.argv[2]
    # URL-encode the query so spaces and special characters survive in the URL.
    query = quote_plus(f"{title} {year}")

    service = Service(executable_path=CHROME_DRIVER_PATH)
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    driver = webdriver.Chrome(service=service, options=options)
    try:
        driver.get(IMDB_URL_TEMPLATE.format(query=query))
        time.sleep(3)  # Wait for page to load
        # Assuming first result is the desired movie.
        # find_element(By.CSS_SELECTOR, ...) replaces find_element_by_css_selector,
        # which was removed in Selenium 4.3.
        # NOTE(review): the CSS selectors below are unchanged from the original
        # and may not match IMDb's current markup - verify against the live page.
        first_result = driver.find_element(By.CSS_SELECTOR, ".result_text a")
        first_result.click()
        time.sleep(3)  # Wait for movie page to load
        movie_title = driver.find_element(By.CSS_SELECTOR, "h1").text
        movie_year = driver.find_element(By.CSS_SELECTOR, "#titleYear a").text
        # Collect other relevant data...
        director = driver.find_element(By.CSS_SELECTOR, "div.credit_summary_item a").text
    finally:
        # Close the browser even when an element is missing
        driver.quit()

    # Write results to file
    with open(RESULTS_PATH, "w") as file:
        file.write(f"Title: {movie_title}\n")
        file.write(f"Year: {movie_year}\n")
        file.write(f"Director: {director}\n")


if __name__ == "__main__":
    main()
"""Download a fully rendered IMDb page with Firefox and save it as UTF-8 HTML.

Usage: selenium_script.py <IMDb URL> [output_file]

When output_file is omitted, the page is saved to the default location inside
the PVD installation folder configured below.
"""
import os
import sys
import time

from selenium import webdriver
from selenium.webdriver.firefox.service import Service

# Check if IMDb URL is provided as a parameter
if len(sys.argv) < 2:
    print("IMDb URL was not provided as a parameter.")
    sys.exit(1)
imdb_url = sys.argv[1]  # IMDb URL from the command line

# Path to geckodriver.exe; adjust it according to the driver location
gecko_path = "C:/Projects/geckodriver.exe"

# PVD installation folder, used only for the default output location
pvd_path = "D:\\MyPVD\\PVD_0.9.9.21_MOD-Simple AllMovies"

# The PVD Pascal script passes the output file as the second argument and then
# polls for exactly that file, so honour it when present.
if len(sys.argv) > 2:
    output_path = os.path.abspath(sys.argv[2])
else:
    output_path = os.path.join(pvd_path, "Scripts", "Tmp", "downpage-UTF8_NO_BOM.htm")

# Check if the folder exists, if not, create it
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Create a browser object
service = Service(gecko_path)
driver = webdriver.Firefox(service=service)
try:
    # Open the IMDb page
    driver.get(imdb_url)
    print(f"The page {imdb_url} is loaded.")
    # Wait for the page to load
    time.sleep(5)
    # Get the entire source HTML of the page
    html_source = driver.page_source
    # Write to a temporary name and rename afterwards, so a caller that polls
    # for the output file never sees a half-written page.
    partial_path = output_path + ".part"
    with open(partial_path, 'w', encoding='utf-8') as file:
        file.write(html_source)
    os.replace(partial_path, output_path)
    print(f"HTML is saved to file: {output_path}")
finally:
    # Close the browser
    driver.quit()
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import time
import os
# Paths
CHROME_DRIVER_PATH = r"C:\ChromeDriver-win64\chromedriver.exe"
CHROME_BINARY_PATH = r"C:\PATH_TO\chrome.exe"
SAVE_PATH = r"C:\PersonalVideoDB\Scripts\Tmp\UTF8_NO_BOM-Awards.mhtml"
# IMDb URL
IMDB_URL = "https://www.imdb.com/name/nm0190859/awards/ (https://www.imdb.com/name/nm0190859/awards/)"
# Chrome options
chrome_options = webdriver.ChromeOptions()
chrome_options.binary_location = CHROME_BINARY_PATH # Specify the Chrome binary location
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--disable-gpu") # Disable GPU for stability
# Remove the headless mode option for non-headless browsing
# chrome_options.add_argument("--headless")
# chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
# Service object for ChromeDriver
service = Service(executable_path=CHROME_DRIVER_PATH)
# Initialize the WebDriver
driver = webdriver.Chrome(service=service, options=chrome_options)
# Navigate to the IMDb page
driver.get(IMDB_URL)
# Wait for the page to fully load
time.sleep(3) # Adjusted sleep time for faster loading
def click_see_more_buttons(max_passes=100):
    """Find and click every "See More" button on the page, one by one.

    The page is searched repeatedly through the module-level ``driver``; each
    pass scrolls to every matching element and clicks it. The search stops
    when no matching element is left.

    Two extra stop conditions keep the loop from running forever:

    * a pass in which no click succeeded (every remaining button raised), and
    * ``max_passes`` search rounds, in case buttons stay on the page after a
      successful click.

    Args:
        max_passes: Upper bound on the number of search-and-click rounds.
    """
    see_more_xpath = "//span[@class='ipc-btn__text']//span[contains(@class, 'ipc-see-more__text')]"

    for _ in range(max_passes):
        buttons = driver.find_elements(By.XPATH, see_more_xpath)
        if not buttons:
            return

        clicked = 0
        for button in buttons:
            try:
                # Scroll to the button to ensure it is visible.
                driver.execute_script("arguments[0].scrollIntoView(true);", button)
                time.sleep(1)
                button.click()
                print("Clicked a 'See More' button.")
                time.sleep(1)  # Short delay to ensure the click is registered
                clicked += 1
            except Exception as e:
                # Best effort: report the failure and continue with the others.
                print(f"Error clicking a 'See More' button: {e}")

        if clicked == 0:
            # Nothing could be clicked in this pass; another pass would find
            # the same elements and fail the same way.
            print("Stopping: the remaining 'See More' buttons could not be clicked.")
            return
try:
    # Click the "See More" buttons
    click_see_more_buttons()

    # Get page source
    page_source = driver.page_source

    # Construct the MHTML-style wrapper manually: a fixed MIME header, the
    # page source as the only part, and the closing boundary line.
    # NOTE(review): the part is declared quoted-printable but the page source
    # is written unencoded, and there are no blank lines between headers and
    # body. Left exactly as-is because the consumer of this file is not
    # visible here - confirm before changing the layout.
    mhtml_boundary = "----=_NextPart_000_0000_01D4E1C0.CE6AA5F0"
    mhtml_content = "\n".join([
        "MIME-Version: 1.0",
        f'Content-Type: multipart/related; boundary="{mhtml_boundary}"',
        "This document is a Single File Web Page, also known as a Web Archive file.",
        f"--{mhtml_boundary}",
        f"Content-Location: {IMDB_URL}",
        "Content-Transfer-Encoding: quoted-printable",
        "Content-Type: text/html; charset=UTF-8",
        page_source,
        f"--{mhtml_boundary}--",
        "",  # keeps the trailing newline after the closing boundary
    ])

    # Write the MHTML data to the specified file path. The file is closed when
    # the with-block ends, so no extra wait is needed before checking it.
    with open(SAVE_PATH, "w", encoding="utf-8") as file:
        file.write(mhtml_content)

    # Confirm file creation
    if os.path.exists(SAVE_PATH):
        print(f"Page saved successfully to {SAVE_PATH}")
    else:
        print(f"Failed to save the page to {SAVE_PATH}")
finally:
    # Close the browser even when clicking, capturing or writing failed, so no
    # Chrome process is left running.
    driver.quit()
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import time
import os
# Locations of the ChromeDriver executable, the Chrome binary and the output file.
CHROME_DRIVER_PATH = r"C:\PATH\TO\chromedriver.exe"
CHROME_BINARY_PATH = r"C:\PATH\TO\chrome.exe"
SAVE_PATH = r"C:\PATH\TO\PersonalVideoDB\Scripts\Tmp\UTF8_NO_BOM-Awards.mhtml"

# Page to capture.
IMDB_URL = "https://www.imdb.com/name/nm0190859/awards/"

# Browser configuration: explicit Chrome binary plus command-line switches,
# applied in the order listed (headless mode and a custom user agent included).
chrome_options = webdriver.ChromeOptions()
chrome_options.binary_location = CHROME_BINARY_PATH
for flag in (
    "--no-sandbox",
    "--disable-dev-shm-usage",
    "--disable-gpu",
    "--headless",
    "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
):
    chrome_options.add_argument(flag)

# Start Chrome through the ChromeDriver service.
service = Service(executable_path=CHROME_DRIVER_PATH)
driver = webdriver.Chrome(service=service, options=chrome_options)

# The page is opened once before the cookie is added, then opened again.
cookie = {'name': 'example_cookie', 'value': 'cookie_value'}
driver.get(IMDB_URL)
driver.add_cookie(cookie)
driver.get(IMDB_URL)

# Fixed pause after navigation before the page is searched for buttons.
time.sleep(3)
def click_see_more_buttons(max_passes=100):
    """Find and click every "See More" button on the page, one by one.

    The page is searched repeatedly through the module-level ``driver``; each
    pass scrolls to every matching element and clicks it. The search stops
    when no matching element is left.

    Two extra stop conditions keep the loop from running forever:

    * a pass in which no click succeeded (every remaining button raised), and
    * ``max_passes`` search rounds, in case buttons stay on the page after a
      successful click.

    Args:
        max_passes: Upper bound on the number of search-and-click rounds.
    """
    see_more_xpath = "//span[@class='ipc-btn__text']//span[contains(@class, 'ipc-see-more__text')]"

    for _ in range(max_passes):
        buttons = driver.find_elements(By.XPATH, see_more_xpath)
        if not buttons:
            return

        clicked = 0
        for button in buttons:
            try:
                # Scroll to the button to ensure it is visible.
                driver.execute_script("arguments[0].scrollIntoView(true);", button)
                time.sleep(1)
                button.click()
                print("Clicked a 'See More' button.")
                time.sleep(1)  # Short delay to ensure the click is registered
                clicked += 1
            except Exception as e:
                # Best effort: report the failure and continue with the others.
                print(f"Error clicking a 'See More' button: {e}")

        if clicked == 0:
            # Nothing could be clicked in this pass; another pass would find
            # the same elements and fail the same way.
            print("Stopping: the remaining 'See More' buttons could not be clicked.")
            return
try:
    # Click the "See More" buttons
    click_see_more_buttons()

    # Get page source
    page_source = driver.page_source

    # Construct the MHTML-style wrapper manually: a fixed MIME header, the
    # page source as the only part, and the closing boundary line.
    # NOTE(review): the part is declared quoted-printable but the page source
    # is written unencoded, and there are no blank lines between headers and
    # body. Left exactly as-is because the consumer of this file is not
    # visible here - confirm before changing the layout.
    mhtml_boundary = "----=_NextPart_000_0000_01D4E1C0.CE6AA5F0"
    mhtml_content = "\n".join([
        "MIME-Version: 1.0",
        f'Content-Type: multipart/related; boundary="{mhtml_boundary}"',
        "This document is a Single File Web Page, also known as a Web Archive file.",
        f"--{mhtml_boundary}",
        f"Content-Location: {IMDB_URL}",
        "Content-Transfer-Encoding: quoted-printable",
        "Content-Type: text/html; charset=UTF-8",
        page_source,
        f"--{mhtml_boundary}--",
        "",  # keeps the trailing newline after the closing boundary
    ])

    # Write the MHTML data to the specified file path. The file is closed when
    # the with-block ends, so no extra wait is needed before checking it.
    with open(SAVE_PATH, "w", encoding="utf-8") as file:
        file.write(mhtml_content)

    # Confirm file creation
    if os.path.exists(SAVE_PATH):
        print(f"Page saved successfully to {SAVE_PATH}")
    else:
        print(f"Failed to save the page to {SAVE_PATH}")
finally:
    # Close the browser even when clicking, capturing or writing failed, so no
    # Chrome process is left running.
    driver.quit()
import threading
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import os
# Paths
CHROME_DRIVER_PATH = r"Q:\Portableapps\ChromeDriver-win64\chromedriver.exe"
CHROME_BINARY_PATH = r"Q:\Portableapps\GoogleChromePortable64\App\Chrome-bin\chrome.exe"

# Site root, opened first so that a cookie can be set before the real page.
IMDB_BASE_URL = "https://www.imdb.com"

# Boundary string used by the hand-built MHTML wrapper.
MHTML_BOUNDARY = "----=_NextPart_000_0000_01D4E1C0.CE6AA5F0"

# Locator for the inner text span of the "See More" buttons.
SEE_MORE_XPATH = "//span[@class='ipc-btn__text']//span[contains(@class, 'ipc-see-more__text')]"

# IMDb URLs and their corresponding save paths. Keys must be bare addresses:
# any trailing text (such as a pasted " (https://...)" duplicate) is passed to
# the browser as part of the URL. A save path ending in ".html" is written as
# plain HTML; every other path gets the MHTML wrapper.
URLS_AND_PATHS = {
    "https://www.imdb.com/name/nm0190859/awards/": r"Q:\Portableapps\PersonalVideoDB\Scripts\Tmp\UTF8_NO_BOM-Awards.mhtml",
    "https://www.imdb.com/name/nm0190859/": r"Q:\Portableapps\PersonalVideoDB\Scripts\Tmp\UTF8_NO_BOM-Main.mhtml",
    "https://www.imdb.com/name/nm0190859/bio/": r"Q:\Portableapps\PersonalVideoDB\Scripts\Tmp\UTF8_NO_BOM-Bio.mhtml",
    "https://www.imdb.com/search/title/?explore=genres&role=nm0190859": r"Q:\Portableapps\PersonalVideoDB\Scripts\Tmp\UTF8_NO_BOM-Genres.html",  # .html => classic HTML
    "https://www.imdb.com/name/nm0190859/?showAllCredits=true": r"Q:\Portableapps\PersonalVideoDB\Scripts\Tmp\UTF8_NO_BOM-Credit.mhtml",
}

# Chrome options (one object, passed to every browser that is started)
chrome_options = webdriver.ChromeOptions()
chrome_options.binary_location = CHROME_BINARY_PATH  # Specify the Chrome binary location
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--disable-gpu")  # Disable GPU for headless mode stability
chrome_options.add_argument("--headless")  # Running Chrome in headless mode
chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")


def _click_see_more_buttons(driver, max_passes=100):
    """Click every "See More" button found through ``driver``, one by one.

    The page is searched repeatedly; the loop ends when no button is left,
    when a whole pass produced no successful click, or after ``max_passes``
    rounds, so a button that can never be clicked cannot stall the download.
    """
    for _ in range(max_passes):
        buttons = driver.find_elements(By.XPATH, SEE_MORE_XPATH)
        if not buttons:
            return

        clicked = 0
        for button in buttons:
            try:
                # Scroll to the button to ensure it is visible.
                driver.execute_script("arguments[0].scrollIntoView(true);", button)
                time.sleep(1)
                button.click()
                print("Clicked a 'See More' button.")
                time.sleep(1)  # Short delay to ensure the click is registered
                clicked += 1
            except Exception as e:
                # Best effort: report the failure and continue with the others.
                print(f"Error clicking a 'See More' button: {e}")

        if clicked == 0:
            print("Stopping: the remaining 'See More' buttons could not be clicked.")
            return


def _build_mhtml(url, page_source):
    """Return ``page_source`` wrapped in the hand-built MHTML envelope.

    NOTE(review): the part is declared quoted-printable although the source is
    inserted unencoded, and headers and body are not separated by blank lines.
    The layout is kept unchanged because the consumer of these files is not
    visible here - confirm before altering it.
    """
    return "\n".join([
        "MIME-Version: 1.0",
        f'Content-Type: multipart/related; boundary="{MHTML_BOUNDARY}"',
        "This document is a Single File Web Page, also known as a Web Archive file.",
        f"--{MHTML_BOUNDARY}",
        f"Content-Location: {url}",
        "Content-Transfer-Encoding: quoted-printable",
        "Content-Type: text/html; charset=UTF-8",
        page_source,
        f"--{MHTML_BOUNDARY}--",
        "",  # keeps the trailing newline after the closing boundary
    ])


# Function to download a page
def download_page(IMDB_URL, SAVE_PATH):
    """Load ``IMDB_URL`` in its own headless Chrome and save it to ``SAVE_PATH``.

    Steps: start a browser, open the site root and add a cookie, open the
    requested page, wait up to 5 seconds for a title element, expand the
    "See More" sections when the URL contains "awards", then write the page
    source. A ``SAVE_PATH`` with the extension ".html" receives the raw page
    source; any other path receives the source inside the MHTML wrapper.

    Args:
        IMDB_URL: Address of the page to capture.
        SAVE_PATH: File the captured page is written to (UTF-8 text).

    Raises:
        Whatever the browser start-up, navigation or file write raises; the
        browser is closed before the exception leaves this function.
    """
    # Service object for ChromeDriver
    service = Service(executable_path=CHROME_DRIVER_PATH)
    # Initialize the WebDriver
    driver = webdriver.Chrome(service=service, options=chrome_options)
    try:
        # Add a custom cookie; the base URL is opened first to set it.
        driver.get(IMDB_BASE_URL)
        cookie = {'name': 'example_cookie', 'value': 'example_value', 'domain': 'imdb.com'}
        driver.add_cookie(cookie)

        # Navigate to the IMDb page
        driver.get(IMDB_URL)

        # Wait for a title element to be present. A failure here is reported
        # but not fatal: whatever has loaded is still saved.
        try:
            WebDriverWait(driver, 5).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "span.ipc-title__text"))
            )
        except Exception as e:
            print(f"Error waiting for the page to load: {e}")

        # Special handling for the awards page
        if "awards" in IMDB_URL:
            _click_see_more_buttons(driver)

        # Get page source
        page_source = driver.page_source

        # Save page source as HTML or MHTML, chosen by the target's extension.
        if os.path.splitext(SAVE_PATH)[1].lower() == ".html":
            # Save as classic HTML
            with open(SAVE_PATH, "w", encoding="utf-8") as file:
                file.write(page_source)
            print(f"Page saved as HTML to {SAVE_PATH}")
        else:
            with open(SAVE_PATH, "w", encoding="utf-8") as file:
                file.write(_build_mhtml(IMDB_URL, page_source))
            print(f"Page saved as MHTML to {SAVE_PATH}")

        # Confirm file creation. The file is already closed by the with-block,
        # so no extra wait is needed.
        if os.path.exists(SAVE_PATH):
            print(f"Page saved successfully to {SAVE_PATH}")
        else:
            print(f"Failed to save the page to {SAVE_PATH}")
    finally:
        # Close the browser on every path so no Chrome process is left behind.
        driver.quit()
# An exception raised inside a worker thread does not reach the main thread,
# so each worker records its own failure here, keyed by URL.
failures = {}


def _download_and_record(url, save_path):
    """Run download_page for one URL and remember any exception it raises."""
    try:
        download_page(url, save_path)
    except Exception as exc:  # worker boundary: record instead of losing it
        failures[url] = exc


# Create and start threads for each URL
threads = []
for url, save_path in URLS_AND_PATHS.items():
    thread = threading.Thread(target=_download_and_record, args=(url, save_path))
    threads.append(thread)
    thread.start()

# Wait for all threads to complete
for thread in threads:
    thread.join()

# Report success only when no worker failed.
if failures:
    for url, exc in failures.items():
        print(f"Failed to save {url}: {exc}")
    print(f"{len(failures)} of {len(threads)} pages could not be saved.")
else:
    print("All pages have been saved successfully.")