Python script to scrape images and metadata from a website for upload to a mediawiki
This is the code that was used to capture the images and metadata for upload to the Offline Museum Kiosk with MediaWiki and Raspberry Pi, using a Python script to upload images and descriptions to a MediaWiki. It writes the metadata to a .json file. The downloaded files will be in the browser's downloads directory. To use this script elsewhere you would need to change the URL; you would also need to change the document identifiers — the "inspect" tool in Firefox will tell you the names of the identifiers on a page.
"""Scrape image links and metadata from a digital-collections site with Selenium.

Loads the search page, expands it to 200 results, collects the URL and title of
every result, then visits each image page to scrape description/creator/date,
trigger an "Extra Large" download, and record the image ID. All metadata is
written to a JSON file; the images land in the browser's downloads directory.
Change ``url``, ``OUTPUT_PATH``, and the element identifiers to use elsewhere.
"""
import requests
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import os
import time
import json

url = "https://<put your URL here>"

# Metadata scraped from each image page: JSON key -> CSS class of the field row.
PAGE_FIELDS = (
    ("description", "field-descri"),
    ("creator", "field-creato"),
    ("date", "field-date"),
)

# Where the collected metadata JSON is written.
OUTPUT_PATH = 'C:\\Users\\Tina\\Desktop\\Image_metadata.json'


def _extract_id(label):
    """Return the path segment following the exact segment "id", or None.

    The download link's data-metrics-event-label contains ".../id/<ID>/...";
    the downloaded file is named from that ID (e.g. p15931coll2_3118_extralarge.jpg).
    Uses exact equality — the original ``endswith("id")`` test could wrongly
    match any segment that merely ends in "id".
    """
    parts = label.split("/")
    try:
        return parts[parts.index("id") + 1]
    except (ValueError, IndexError):
        return None


Image_metadata = []  # accumulates one dict of metadata per image

# Load the search page with Selenium.
driver = webdriver.Edge()   # launch the Edge browser (NOT Chrome, despite the old comment)
driver.implicitly_wait(20)  # how long the driver waits to find things, i.e. for pages to load
driver.get(url)

# Change the results-per-page dropdown to 200 so one page shows all images.
dropdown_element = driver.find_element(By.ID, "desktopResultsPerPageSelector")
Select(dropdown_element).select_by_visible_text('200')

# Collect the URL and title of every search result.
search_results = driver.find_elements(By.CLASS_NAME, "SearchResult-container")
for item in search_results:
    img = {"URL": item.get_attribute("href")}
    img["title"] = item.find_element(By.CLASS_NAME, "MetadataFields-header").text
    Image_metadata.append(img)

# Visit each image page to scrape the remaining fields and download the picture.
for index, img in enumerate(Image_metadata):
    print(index)
    driver.get(img["URL"])  # navigate to the image's own page

    # Each labelled field row holds its text in a nested "field-value" element.
    for key, css_class in PAGE_FIELDS:
        row = driver.find_element(By.CLASS_NAME, css_class)
        img[key] = row.find_element(By.CLASS_NAME, "field-value").text

    # Download the picture: the size-menu is hidden, so un-hide it first.
    menu = driver.find_element(By.ID, "downloadsizemenu-side-bar")
    driver.execute_script("arguments[0].style.display = 'block';", menu)
    xlg = driver.find_element(By.LINK_TEXT, "Extra Large")
    xlg.click()     # click the link to start the download
    time.sleep(3)   # crude wait for the download to finish — TODO: poll instead

    # Record the image ID so the downloaded file can be matched up later.
    image_id = _extract_id(xlg.get_attribute("data-metrics-event-label"))
    if image_id is not None:
        img["ID"] = image_id

print(Image_metadata)

with open(OUTPUT_PATH, 'w', encoding="utf-8") as f:
    json.dump(Image_metadata, f)

driver.quit()  # quit the browser
print("program complete")