"""TMDb media lookup service with OCR-based screenshot filtering.

The ``/get-media`` endpoint takes an IMDb title ID, resolves the matching
TMDb movie, picks the widest poster and up to ``top_shots`` backdrops that
pass a Tesseract OCR "no text" check, and submits each selected image URL to
the OptiPix service for optimization.
"""
import asyncio
import io
import os
import re
from typing import Any, Dict, List, Optional
from urllib.parse import quote

import httpx
import pytesseract
import uvicorn
from fastapi import FastAPI, Form, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from PIL import Image
from pydantic import BaseModel

# Path to the Tesseract binary. Defaults to the usual Linux location and can
# be overridden with the TESSERACT_CMD environment variable.
pytesseract.pytesseract.tesseract_cmd = os.getenv("TESSERACT_CMD", "/usr/bin/tesseract")

app = FastAPI(title="TMDB + OCR Pro API | Badal Special")

# NOTE(review): wildcard origins together with allow_credentials=True is a
# very permissive CORS policy; confirm that credentialed cross-origin access
# is really required before tightening or keeping it.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

OPTIPIX_API = "https://bk939448-image-optimizer-api.hf.space/upload-poster"
TMDB_API_KEY = os.getenv("TMDB_API_KEY")
TMDB_API_BASE = "https://api.themoviedb.org/3"
TMDB_IMAGE_BASE = "https://image.tmdb.org/t/p/original"

# Matches everything that is NOT an ASCII letter/digit or a character in the
# U+0900-U+097F / U+0C00-U+0C7F ranges (the hin/tel scripts passed to OCR).
_NON_TEXT_CHARS = re.compile(r'[^a-zA-Z0-9\u0900-\u097F\u0C00-\u0C7F]')

# An image with more than this many recognised characters is treated as
# containing text (i.e. a poster/title card rather than a clean screenshot).
_MAX_CLEAN_TEXT_CHARS = 4


class ImageMedia(BaseModel):
    """An image URL and, when optimization succeeded, its processed URL."""

    original_url: str
    processed_url: Optional[str] = None


class ProcessResponse(BaseModel):
    """Response payload of the ``/get-media`` endpoint."""

    title_id: str
    tmdb_id: int
    requested_shots: int
    # NOTE(review): this is filled with the number of candidate backdrops,
    # not the number actually downloaded and OCR-scanned (the scan stops early
    # once enough screenshots are found) - confirm which one clients expect.
    total_screenshots_scanned: int
    poster: Optional[ImageMedia] = None
    screenshots: List[ImageMedia]


# --- 1. OCR Scanner Function ---
def check_text_in_image(image_bytes: bytes) -> bool:
    """Return True when OCR finds text in the image (or the image is unreadable).

    Args:
        image_bytes: Raw encoded image data.

    Returns:
        True if more than 4 letter/digit characters are recognised, or if
        decoding/OCR fails (failures are rejected rather than risked);
        False when the image looks text-free.
    """
    try:
        with Image.open(io.BytesIO(image_bytes)) as img:
            img.thumbnail((500, 500))  # shrink in place for faster scanning
            gray = img.convert('L')  # grayscale
            # Scan for English, Hindi and Telugu text.
            text = pytesseract.image_to_string(gray, lang='eng+hin+tel')
    except Exception as e:
        print(f"OCR Parsing Error: {e}")
        return True  # do not take the risk: treat as "has text" so it is rejected

    # Keep only letters and digits before counting.
    clean_text = _NON_TEXT_CHARS.sub('', text)
    return len(clean_text) > _MAX_CLEAN_TEXT_CHARS


# --- 2. Parallel OptiPix Function ---
async def optimize_image(
    client: httpx.AsyncClient, raw_url: str, level: str
) -> Dict[str, Optional[str]]:
    """Ask OptiPix to optimize ``raw_url`` and report the outcome.

    Args:
        client: Shared HTTP client used for the request.
        raw_url: URL of the image to optimize.
        level: Compression level forwarded to OptiPix.

    Returns:
        A dict with ``original_url`` and ``processed_url``; the latter stays
        ``None`` when the request fails or OptiPix does not report success.
        This function never raises, so it is safe inside ``asyncio.gather``.
    """
    form_data = {"level": level, "url": raw_url}
    result = {"original_url": raw_url, "processed_url": None}
    try:
        res = await client.post(OPTIPIX_API, data=form_data, timeout=30.0)
        data = res.json()
        if data.get("success"):
            result["processed_url"] = data.get("url")
    except Exception as e:
        # Best effort: a failed optimization must not fail the whole request.
        print(f"OptiPix failed for {raw_url} - Error: {e}")
    return result


async def _tmdb_get_json(
    client: httpx.AsyncClient, path: str, params: Optional[Dict[str, str]] = None
) -> Dict[str, Any]:
    """GET a TMDb API path and return its JSON object, or raise HTTP 502."""
    query = dict(params or {})
    query["api_key"] = TMDB_API_KEY
    try:
        res = await client.get(f"{TMDB_API_BASE}{path}", params=query)
        res.raise_for_status()
        data = res.json()
    except httpx.HTTPStatusError as exc:
        # Do not print/return str(exc): it contains the URL with the API key.
        print(f"TMDb request to {path} failed with status {exc.response.status_code}")
        raise HTTPException(status_code=502, detail="TMDb request failed") from exc
    except (httpx.HTTPError, ValueError) as exc:
        # ValueError covers a response body that is not valid JSON.
        print(f"TMDb request to {path} failed: {type(exc).__name__}")
        raise HTTPException(status_code=502, detail="TMDb request failed") from exc
    if not isinstance(data, dict):
        raise HTTPException(status_code=502, detail="TMDb request failed")
    return data


def _image_width(entry: Dict[str, Any]) -> int:
    """Sort key: the entry's width, treating a missing/None width as 0."""
    return entry.get("width") or 0


async def _collect_text_free_shots(
    client: httpx.AsyncClient, backdrops: List[Dict[str, Any]], limit: int
) -> List[str]:
    """Return up to ``limit`` backdrop URLs, in order, that pass the OCR check."""
    verified: List[str] = []
    for shot in backdrops:
        if len(verified) >= limit:
            break  # requirement met, stop scanning
        file_path = shot.get("file_path")
        if not file_path:
            continue
        shot_url = f"{TMDB_IMAGE_BASE}{file_path}"
        try:
            # Download the image and hand it to OCR.
            downloaded = await client.get(shot_url, timeout=10.0)
            if downloaded.status_code != 200:
                continue
            # Run OCR in a worker thread so the event loop is not blocked.
            has_text = await asyncio.to_thread(check_text_in_image, downloaded.content)
        except Exception as e:
            print(f"Image download error for OCR: {e}")
            continue
        if has_text:
            print(f"Rejected by OCR (Text Found): {shot_url}")
        else:
            verified.append(shot_url)
            print(f"Clean Screenshot Passed OCR: {shot_url}")
    return verified


@app.post("/get-media", response_model=ProcessResponse)
async def get_media(
    title_id: str = Form(..., description="IMDb Title ID (e.g., tt3801314)"),
    top_shots: int = Form(3, description="Number of screenshots required"),
    level: str = Form("extreme", description="Compression level"),
):
    """Resolve an IMDb ID to TMDb media and return optimized poster/screenshots.

    Args:
        title_id: IMDb title ID used to look the movie up on TMDb.
        top_shots: Maximum number of text-free screenshots to return.
        level: Compression level forwarded to OptiPix.

    Returns:
        A ``ProcessResponse`` with the poster (if any) and the screenshots
        that passed the OCR check.

    Raises:
        HTTPException: 500 if ``TMDB_API_KEY`` is not configured, 404 if TMDb
            has no movie for ``title_id``, 502 if a TMDb request fails.
    """
    if not TMDB_API_KEY:
        raise HTTPException(status_code=500, detail="TMDB_API_KEY is missing!")

    async with httpx.AsyncClient(timeout=120.0) as client:
        # --- STEP 1: find the TMDb ID ---
        # Percent-encode the user-supplied ID so characters such as '/', '?'
        # or '#' cannot alter the request path or query string.
        find_data = await _tmdb_get_json(
            client,
            f"/find/{quote(title_id, safe='')}",
            {"external_source": "imdb_id"},
        )
        movie_results = find_data.get("movie_results") or []
        if not movie_results:
            # Returning a plain {"error": ...} dict here would fail
            # response_model validation and surface as a 500.
            raise HTTPException(
                status_code=404,
                detail="TMDb पर इस IMDb ID की कोई मूवी नहीं मिली!",
            )
        tmdb_id = movie_results[0]["id"]

        # --- STEP 2: fetch the images from TMDb ---
        img_data = await _tmdb_get_json(client, f"/movie/{tmdb_id}/images")
        raw_backdrops = img_data.get("backdrops") or []
        raw_posters = img_data.get("posters") or []

        # Only keep backdrops whose language is null, widest first.
        clean_backdrops = sorted(
            (shot for shot in raw_backdrops if shot.get("iso_639_1") is None),
            key=_image_width,
            reverse=True,
        )

        # --- STEP 3: pick the poster (widest one; first wins on ties) ---
        best_poster_url = None
        if raw_posters:
            best_poster = max(raw_posters, key=_image_width)
            if best_poster.get("file_path"):
                best_poster_url = f"{TMDB_IMAGE_BASE}{best_poster['file_path']}"

        # --- STEP 4: OCR scanning ---
        verified_screenshots_urls = await _collect_text_free_shots(
            client, clean_backdrops, top_shots
        )

        # --- STEP 5: parallel optimization (OptiPix) ---
        urls = [best_poster_url] if best_poster_url else []
        urls.extend(verified_screenshots_urls)
        results = await asyncio.gather(
            *(optimize_image(client, url, level) for url in urls)
        )

        # When a poster was submitted it is always the first result.
        if best_poster_url:
            final_poster, *final_screenshots = results
        else:
            final_poster = None
            final_screenshots = list(results)

        return ProcessResponse(
            title_id=title_id,
            tmdb_id=tmdb_id,
            requested_shots=top_shots,
            total_screenshots_scanned=len(clean_backdrops),
            poster=final_poster,
            screenshots=final_screenshots,
        )


if __name__ == "__main__":
    # Pass the app object directly so startup does not depend on this file
    # being importable as a module named "app".
    uvicorn.run(app, host="0.0.0.0", port=7860)