sreepathi-ravikumar committed on
Commit
dc55118
·
verified ·
1 Parent(s): 4b4e476

Update image_fetcher.py

Browse files
Files changed (1) hide show
  1. image_fetcher.py +49 -52
image_fetcher.py CHANGED
@@ -1,73 +1,70 @@
1
-
2
  import os
3
- import time
 
4
  import random
5
- import requests
6
  from PIL import Image
7
  from io import BytesIO
8
  from duckduckgo_search import DDGS
 
9
 
10
# Directory names used by the fetcher, relative to the working directory.
DIRS = dict(
    base="images",
    temp=os.path.join("images", "tmp"),
)

# Create the base image directory under the current working directory at
# import time (only the base directory is created here, not the temp one).
path = os.path.join(os.getcwd(), "images")
os.makedirs(path, exist_ok=True)
19
 
20
# Helper: request headers with a randomly selected User-Agent
def get_headers():
    """Return a headers dict whose only key is a randomly chosen User-Agent.

    The value is picked with ``random.choice`` from three fixed desktop
    browser strings.
    """
    agent_pool = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)",
        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:90.0)",
    )
    chosen_agent = random.choice(agent_pool)
    return {'User-Agent': chosen_agent}
28
 
29
# Check if image meets requirements
def is_valid_image(img):
    """Return True when *img* is at least 854x480 and close to 16:9.

    Args:
        img: Any object exposing a ``size`` attribute that unpacks to
            ``(width, height)``.

    Returns:
        bool: True if width >= 854, height >= 480, and the aspect ratio
        (rounded to two decimals) is within 0.2 of 16/9.
    """
    width, height = img.size
    # Check the minimum dimensions first: besides rejecting small images,
    # this guarantees height is non-zero before the division below.
    if width < 854 or height < 480:
        return False
    ratio = round(width / height, 2)
    return abs(ratio - (16 / 9)) <= 0.2  # relaxed ratio check
34
 
35
# Try to download and validate image
def validate_image(img_url, keyword):
    """Download *img_url*, and save it as ``images/<keyword>.jpg`` if it passes validation.

    Args:
        img_url: URL of the candidate image.
        keyword: File name stem used for the saved JPEG.

    Returns:
        bool: True if the image was downloaded, passed ``is_valid_image``
        and was saved; False if it was rejected or any error occurred
        (the error is printed, not raised).
    """
    try:
        response = requests.get(img_url, headers=get_headers(), timeout=10)
        # Surface HTTP 4xx/5xx as an error instead of handing an error page
        # to PIL, which would otherwise fail with an unrelated decode message.
        response.raise_for_status()
        img = Image.open(BytesIO(response.content)).convert("RGB")
        if not is_valid_image(img):
            print(f"Skipped (not valid): {img_url}")
            return False
        final_path = os.path.join("images", f"{keyword}.jpg")
        img.save(final_path)
        print(f"Saved: {final_path} | Size: {img.size}")
        return True
    except Exception as e:
        # Best-effort: a bad candidate must not abort the surrounding search.
        print(f"Failed to validate {img_url}: {e}")
        return False
50
 
51
# Search DuckDuckGo for each prompt and keep the first image that validates
def fetch_images_from_prompts(prompt_list):
    """For every prompt, search DuckDuckGo images and save the first valid hit.

    Up to 20 results are requested per prompt and tried in order through
    ``validate_image``; a random 1.5-3.5 s pause follows each rejected
    candidate. Any exception during the search is printed and followed by
    a 10 s sleep. Nothing is returned; progress is reported via ``print``.
    """
    for prompt in prompt_list:
        underscored = prompt.replace(" ", "_")
        file_stem = underscored.lower()
        print(f"\nSearching for: {prompt}")
        saved = False

        try:
            with DDGS() as client:
                for hit in client.images(prompt, max_results=20):
                    saved = validate_image(hit['image'], file_stem)
                    if saved:
                        break
                    time.sleep(random.uniform(1.5, 3.5))  # anti-bot delay
        except Exception as err:
            print(f"Rate limit or error: {err}. Sleeping 10 seconds...")
            time.sleep(10)  # longer wait on rate limit

        if not saved:
            print(f"No suitable image found for: {prompt}")
 
 
 
 
 
 
 
 
 
 
 
 
 
73
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
+ import asyncio
3
+ import aiohttp
4
  import random
 
5
  from PIL import Image
6
  from io import BytesIO
7
  from duckduckgo_search import DDGS
8
+ import time
9
 
10
# Output directory for downloaded images, placed under /tmp for Hugging Face.
# It is created at import time if it does not exist yet.
IMAGE_DIR = "/tmp/images"
os.makedirs(IMAGE_DIR, exist_ok=True)
 
 
 
 
 
 
13
 
14
# Headers
def get_headers():
    """Build request headers containing one randomly picked User-Agent.

    Returns a single-key dict; the value is drawn with ``random.choice``
    from three fixed desktop browser identifiers.
    """
    candidates = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)",
        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:90.0)",
    )
    picked = random.choice(candidates)
    return {"User-Agent": picked}
22
 
23
# Image validation
def is_valid_image(image):
    """Return True when *image* is at least 854x480 and close to 16:9.

    Args:
        image: Any object exposing a ``size`` attribute that unpacks to
            ``(width, height)``.

    Returns:
        bool: True if width >= 854, height >= 480, and the aspect ratio
        (rounded to two decimals) is within 0.2 of 16/9.
    """
    width, height = image.size
    # Size gate goes first so a zero (or tiny) height is rejected before
    # it can reach the division and raise ZeroDivisionError.
    if width < 854 or height < 480:
        return False
    aspect_ratio = round(width / height, 2)
    return abs(aspect_ratio - (16 / 9)) <= 0.2
28
 
29
# Async fetch image
async def fetch_image(session, url, name):
    """Download *url*, validate it, and save it as a JPEG in ``IMAGE_DIR``.

    Args:
        session: Client session used to issue the GET request.
        url: Address of the candidate image.
        name: File name stem; a millisecond timestamp is appended to it.

    Returns:
        str: ``"Saved: <file>"`` on success, ``"Skipped (invalid): <name>"``
        when ``is_valid_image`` rejects the picture, or
        ``"Error: <name> | <exc>"`` if anything raised. Exceptions are
        reported through the return value, never propagated.
    """
    try:
        # Explicit ClientTimeout with the same 10 s total budget as before.
        request_timeout = aiohttp.ClientTimeout(total=10)
        async with session.get(url, timeout=request_timeout) as response:
            # Turn HTTP 4xx/5xx into an error here rather than letting PIL
            # fail later on an error page with an unrelated message.
            response.raise_for_status()
            content = await response.read()

        # The body is fully read, so decoding happens after the connection
        # has been released back to the session.
        image = Image.open(BytesIO(content)).convert("RGB")

        if not is_valid_image(image):
            return f"Skipped (invalid): {name}"

        unique_name = f"{name}_{int(time.time() * 1000)}.jpg"
        path = os.path.join(IMAGE_DIR, unique_name)
        image.save(path)
        return f"Saved: {unique_name}"

    except Exception as e:
        # Best-effort: the caller moves on to the next candidate URL.
        return f"Error: {name} | {e}"
 
46
 
47
# Async search + download with DDGS inside thread
async def search_and_download(session, prompt, sem):
    """Search DuckDuckGo images for *prompt* and save the first valid result.

    Args:
        session: Client session forwarded to ``fetch_image``.
        prompt: Search phrase; also used (spaces -> underscores, lowercased)
            as the saved file's name stem.
        sem: Semaphore held for the whole search-and-download of this prompt.

    Returns:
        str: The ``"Saved: ..."`` message from ``fetch_image`` for the first
        candidate that was saved, ``"No valid image for: <prompt>"`` if none
        was, or ``"Search failed for <prompt>: <exc>"`` if the search raised.
    """

    def _search():
        # Runs in the default executor so the blocking search does not stall
        # the event loop; the context manager closes the client afterwards.
        with DDGS() as ddgs:
            return list(ddgs.images(prompt, max_results=15))

    async with sem:
        name = prompt.replace(" ", "_").lower()
        try:
            loop = asyncio.get_running_loop()
            results = await loop.run_in_executor(None, _search)
            for item in results:
                url = item.get("image")
                if not url:
                    # Result carries no image URL; nothing to download.
                    continue
                result = await fetch_image(session, url, name)
                # Match the status prefix, not a substring, so an error
                # message that happens to contain "Saved" is not a success.
                if result.startswith("Saved"):
                    return result
            return f"No valid image for: {prompt}"
        except Exception as e:
            return f"Search failed for {prompt}: {e}"
62
 
63
# Main runner
async def main(prompts):
    """Run ``search_and_download`` for every prompt and print each outcome.

    All prompts share one client session (created with headers from
    ``get_headers``) and one ``Semaphore(5)``. Outcomes are printed in the
    same order as *prompts*; nothing is returned.
    """
    limiter = asyncio.Semaphore(5)  # Limit concurrency
    async with aiohttp.ClientSession(headers=get_headers()) as http:
        outcomes = await asyncio.gather(
            *(search_and_download(http, prompt, limiter) for prompt in prompts)
        )
        for outcome in outcomes:
            print(outcome)