clementBE committed on
Commit
c5aba08
·
verified ·
1 Parent(s): ae8813c

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +115 -0
app.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import time

import requests
from bs4 import BeautifulSoup
from fastapi import FastAPI, HTTPException, Response
from fastapi.responses import FileResponse
from fastapi.staticfiles import StaticFiles
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
9
+
10
+ # Initialize the FastAPI app
11
+ app = FastAPI()
12
+
13
def getimage(url: str) -> str:
    """Scrape a profile image from ``url`` and save it to the local filesystem.

    The page is rendered in headless Chrome through Selenium and parsed with
    BeautifulSoup. The first ``<img>`` that has a ``src`` and whose ``alt``
    text contains "profile picture" or "avatar" is downloaded; if there is no
    such tag, the first ``<img>`` with a ``src`` attribute is used instead.

    Args:
        url: Address of the page to scrape.

    Returns:
        The name of the file the image was written to. This is always
        ``"instagram_profile.png"`` (relative to the working directory), so
        each call overwrites the previous result.

    Raises:
        RuntimeError: If any step fails (browser start-up, navigation, image
            lookup, download or file write). The original exception is
            chained as the cause.
    """
    # Function-scope import: only this function needs it.
    from urllib.parse import urljoin

    download_timeout = 30  # seconds; without a timeout requests can block forever
    filename = "instagram_profile.png"

    # 1. Set up Selenium options.
    chrome_options = Options()
    # REQUIRED for deployment on servers like Hugging Face Spaces or Docker
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    # Chrome expects "width,height"; the "1200x800" form is not parsed as a size.
    chrome_options.add_argument("--window-size=1200,800")

    def is_profile_image(tag):
        """Return True for an <img> with a src whose alt text looks like a profile picture."""
        if tag.name != 'img' or not tag.get('src'):
            return False
        alt_text = tag.get('alt', '').lower()
        # Common alt texts used for the main profile picture
        return 'profile picture' in alt_text or 'avatar' in alt_text

    driver = None
    try:
        # 2. Initialize the WebDriver.
        driver = webdriver.Chrome(options=chrome_options)

        # 3. Navigate and wait.
        driver.get(url)
        # Fixed pause to give dynamically loaded content (the profile picture)
        # time to appear before the DOM is captured.
        time.sleep(5)
        page_source = driver.page_source

        # 4. Parse the rendered source.
        soup = BeautifulSoup(page_source, 'html.parser')

        # 5. Prefer an image whose alt text identifies it as the profile picture.
        img_tag = soup.find(is_profile_image)

        # Fallback: take the first image on the page that has a 'src' attribute.
        if not img_tag:
            print("Fallback to finding the first image with a 'src' attribute.")
            img_tag = soup.find('img', src=True)

        if not img_tag:
            raise ValueError("Could not find a suitable image tag on the page.")

        # Resolve relative and protocol-relative sources against the page URL.
        img_url = urljoin(url, img_tag['src'])

        # 6. Download the image, streaming it to disk in chunks.
        with requests.get(img_url, stream=True, timeout=download_timeout) as r:
            r.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)
            with open(filename, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)

        return filename

    except Exception as e:
        # Wrap every failure in one exception type that names the offending URL.
        raise RuntimeError(f"Scraping failed for URL {url}: {e}") from e
    finally:
        # Always shut the browser down, on success and on error alike.
        if driver:
            driver.quit()
77
+
78
+
79
+ # --- FastAPI Endpoints ---
80
+
81
+ # Endpoint to trigger the image scraping
82
@app.get("/fetch_profile_image")
def fetch_image_endpoint(input_url: str):
    """Scrape the profile image of ``input_url`` and report where it was saved.

    Args:
        input_url: Query parameter holding the page URL. It must start with
            ``http://`` or ``https://`` (the scheme is matched
            case-insensitively).

    Returns:
        A JSON object with ``status``, ``message`` and ``filename`` keys.

    Raises:
        HTTPException: 400 if ``input_url`` does not have an http(s) scheme;
            500 if scraping fails, with the underlying error text as detail.
    """
    # NOTE(security): input_url is caller-supplied and is fetched server-side.
    # Restrict the allowed hosts before exposing this endpoint publicly.
    # Match the full scheme prefix: a bare "http" check would accept "httpfoo".
    if not input_url.lower().startswith(("http://", "https://")):
        raise HTTPException(status_code=400, detail="Input must be a valid URL starting with http:// or https://")

    # Keep the try body to the one call that can fail.
    try:
        saved_filename = getimage(input_url)
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e

    # We can also return the image itself, but for simplicity,
    # we'll confirm the file was saved.
    return {
        "status": "success",
        "message": f"Profile picture successfully caught and saved as {saved_filename}",
        "filename": saved_filename
    }
102
+
103
# The root endpoint serves the main HTML page.
# It is registered BEFORE the static mount below: routes are matched in
# registration order, and a mount at "/" placed first would capture every
# request and leave this handler unreachable.
@app.get("/")
def index():
    """Serve ``static/index.html`` if it exists, otherwise a JSON status message.

    No return annotation is declared because the two branches return
    different kinds of value (a ``FileResponse`` or a plain dict).
    """
    if os.path.exists("static/index.html"):
        return FileResponse(path="static/index.html", media_type="text/html")
    # If running without a UI, just return a simple message
    return {"message": "Image Scraper API Running. Access /fetch_profile_image?input_url=<URL> to test."}


# This part serves the static files (like a frontend HTML page).
# Note: You would need a 'static' folder with an 'index.html' file to see a UI.
# StaticFiles raises at construction time when the directory is missing, so
# only mount it when the folder exists; the API then still starts without a UI.
if os.path.isdir("static"):
    app.mount("/", StaticFiles(directory="static", html=True), name="static")