Seleniumで自動画像収集

公開日：2023年4月6日　／　更新日：2023年10月13日

Google検索とPython Seleniumライブラリで自動画像収集を組んでみました。
時間をかけずに大量のデータを取得できます。

■適用範囲
機械学習等に用いる画像サンプルデータの収集

■ポイントとなるドライバ、ライブラリ
pip install chromedriver_binary
OSError: [Errno 8] Exec format error: 'chromedriver'
⇢sudo apt-get install chromium-chromedriver　#ブラウザがchromiumのため、必要
AttributeError: 'WebDriver' object has no attribute 'find_elements_by_css_selector'
⇢pip install selenium==3.141.0　#バージョン指定（find_elements_by_* 系メソッドは Selenium 4.3 以降で削除されたため）

■サンプルプログラム（ねこの画像を/homeへ保存する場合）
#!/usr/bin/env python
# coding: utf-8
import chromedriver_binary
import os
import time
from selenium import webdriver
from PIL import Image
import io
import requests
import hashlib
# Seconds to wait after a browser interaction (e.g. a click) so the page can update.
sleep_between_interactions = 3
# Maximum number of search-result thumbnails to click through.
download_num = 50
# Search keyword.
query = "ねこ"
# Google image-search URL template; both {q} placeholders take the search keyword.
search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img"
# Collect image URLs by clicking through the search-result thumbnails.
# The result is the set `image_urls`, consumed by the download step below.
image_urls = set()
# Launch the web browser.
wd = webdriver.Chrome()
try:
    wd.get(search_url.format(q=query))
    # Thumbnail elements. If this step fails, inspect the live page and
    # update the selector.
    thumbnail_results = wd.find_elements_by_css_selector("img.rg_i")
    # Click each thumbnail, then read the image URLs that appear.
    for img in thumbnail_results[:download_num]:
        try:
            img.click()
            time.sleep(sleep_between_interactions)
        except Exception:
            # Best effort: skip any thumbnail that cannot be clicked.
            continue
        # The URL cannot be obtained in one step, so gather every candidate
        # element and filter afterwards.
        # The class name 'n3VNCb' changes from time to time; inspect the
        # element of a clicked image and adjust it when necessary.
        url_candidates = wd.find_elements_by_class_name('n3VNCb')
        for candidate in url_candidates:
            url = candidate.get_attribute('src')
            # Keep only candidates whose src contains 'https'.
            if url and 'https' in url:
                image_urls.add(url)
    # The script did not terminate normally without a short pause here, so
    # wait 4 seconds on top of the usual interaction delay.
    time.sleep(sleep_between_interactions + 4)
finally:
    # Always close the browser, even when scraping raised an exception.
    wd.quit()
# Download every collected image and save it as a JPEG file.
image_save_folder_path = '/home'
for url in image_urls:
    try:
        # A timeout keeps one unresponsive server from hanging the whole run.
        image_content = requests.get(url, timeout=10).content
    except Exception as e:
        print(f"ERROR – Could not download {url} – {e}")
        # Nothing was downloaded for this URL; skip the save step so that a
        # missing or stale `image_content` from an earlier iteration is
        # never written out.
        continue
    try:
        image_file = io.BytesIO(image_content)
        # Convert to RGB so the image can be written as JPEG.
        image = Image.open(image_file).convert('RGB')
        # File name: first 10 hex digits of the SHA-1 of the downloaded bytes.
        file_name = hashlib.sha1(image_content).hexdigest()[:10] + '.jpg'
        file_path = os.path.join(image_save_folder_path, file_name)
        with open(file_path, 'wb') as f:
            image.save(f, "JPEG", quality=90)
        print(f"SUCCESS – saved {url} – as {file_path}")
    except Exception as e:
        print(f"ERROR – Could not save {url} – {e}")