Python script to scrape images and metadata from a website for upload to a mediawiki

This is the code that was used to capture the images and metadata for the Offline Museum Kiosk With MediaWiki and Raspberry Pi; the captured material is then uploaded with the Python script to upload images and descriptions to a mediawiki. The script writes the metadata to a .json file, and the downloaded image files end up in the browser's downloads directory. To use this script elsewhere you would need to change the URL, and you would also need to change the document identifiers; the "inspect" tool in Firefox will show you the names of the identifiers on a page.
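
For reference, each record the script writes to the .json file carries the fields filled in below: the page URL and title from the search results, plus the description, creator, date and ID pulled from each image page. One record ends up looking roughly like this (the values here are made-up placeholders, not real data):

[
    {
        "URL": "https://<put your URL here>/.../id/3118",
        "title": "Example title",
        "description": "Example description text",
        "creator": "Example creator name",
        "date": "1901",
        "ID": "3118"
    }
]

The script itself follows.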

import requests
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import os
import time
import json

url = "https://<put your URL here>"

Image_metadata = []                     # start the data structure that will contain the images and their metadata 

# Load the web page with selenium
driver = webdriver.Edge()               # launch the Edge browser
driver.implicitly_wait(20)              # tell the driver how long to wait when locating elements (i.e. while pages load)
driver.get(url)                         # navigate to the web page

# change the results-per-page dropdown to 200 so all images are listed
dropdown_element = driver.find_element(By.ID, "desktopResultsPerPageSelector")
                                        # find the dropdown element
dropdown = Select(dropdown_element)     # wrap it in a Select object
dropdown.select_by_visible_text('200')  # select the '200' option by its visible text

search_results = driver.find_elements(By.CLASS_NAME, "SearchResult-container")
                                        # get a list of all results

# get all data from search results into Image_metadata list
for index, item in enumerate(search_results):
    img = {"URL": item.get_attribute("href")}
    img["title"] = item.find_element(By.CLASS_NAME, "MetadataFields-header").text
    #print (index, img)
    Image_metadata.append(img)

# iterate through Images to get remaining data from image pages
for index, img in enumerate(Image_metadata):
    print(index)
    
    driver.get(img["URL"])              # navigate to the image page
    
    # description field
    meta_element = driver.find_element(By.CLASS_NAME, "field-descri")
    value_element = meta_element.find_element(By.CLASS_NAME, "field-value")
    img["description"] = value_element.text
    
    # creator field
    meta_element = driver.find_element(By.CLASS_NAME, "field-creato")
    value_element = meta_element.find_element(By.CLASS_NAME, "field-value")
    img["creator"] = value_element.text
    
    # date field
    meta_element = driver.find_element(By.CLASS_NAME, "field-date")
    value_element = meta_element.find_element(By.CLASS_NAME, "field-value")
    img["date"] = value_element.text
    
    # download the picture
    xlg = driver.find_element(By.ID, "downloadsizemenu-side-bar")
                                        # find the hidden menu that contains the image-size download links
    driver.execute_script("arguments[0].style.display = 'block';", xlg)
                                        # make the menu visible so its links can be clicked
    xlg = driver.find_element(By.LINK_TEXT, "Extra Large")
                                        # find the "Extra Large" link
    xlg.click()                         # click the link to start the download
    time.sleep(3)                       # wait for the download to finish

    # get the ID for the picture so the downloaded file can be referenced later.
    # the file downloads with a name like p15931coll2_3118_extralarge.jpg

    xlg.get_attribute("href")           # the link's href also holds the URL, but it is not used below

    x = xlg.get_attribute("data-metrics-event-label")
                                        # this attribute contains the id...

    # find "/id/" in the string and grab the part between it and the next "/"
    x_parts = x.split("/")              # split the string at the "/"s
    for i in range(len(x_parts)):
        if x_parts[i].endswith("id"):   # find the part that ends with "id"
            img["ID"] = x_parts[i + 1]  # the next part is the id itself
            break

print(Image_metadata)

with open('C:\\Users\\Tina\\Desktop\\Image_metadata.json', 'w') as f:
    json.dump(Image_metadata, f)
    
driver.quit() # quit the browser
print("program complete")