Spaces:

bk939448
/

Still_frame

Running

App Files Files Community

Still_frame / app.py

bk939448

Update app.py

97ef3a7 verified about 1 month ago

raw

history blame contribute delete

6.87 kB

	import os
	import httpx
	from fastapi import FastAPI, Form, HTTPException
	from fastapi.middleware.cors import CORSMiddleware
	from pydantic import BaseModel
	from typing import Optional, List
	import asyncio
	import uvicorn
	import pytesseract
	from PIL import Image
	import io
	import re

	# Path to the Tesseract binary on Linux.
	pytesseract.pytesseract.tesseract_cmd = '/usr/bin/tesseract'

	# The title uses a plain "|"; a backslash before it would be an invalid
	# escape sequence (SyntaxWarning on Python 3.12+) and would also leak a
	# literal backslash into the generated API docs.
	app = FastAPI(title="TMDB + OCR Pro API | Badal Special")

	# NOTE(review): FastAPI's CORS documentation advises against combining
	# wildcard origins with allow_credentials=True. Behaviour is left unchanged
	# here; confirm whether credentialed cross-origin requests are needed.
	app.add_middleware(
	    CORSMiddleware,
	    allow_origins=["*"],
	    allow_credentials=True,
	    allow_methods=["*"],
	    allow_headers=["*"],
	)

	# Endpoint of the external image-optimizer service that images are posted to.
	OPTIPIX_API = "https://bk939448-image-optimizer-api.hf.space/upload-poster"
	# TMDb API key, read from the environment once at import time (None if unset).
	TMDB_API_KEY = os.getenv("TMDB_API_KEY")

	class ImageMedia(BaseModel):
	    """An image URL paired with the URL of its processed copy, if any."""

	    # URL of the source image.
	    original_url: str
	    # URL of the processed image, or None when no processed copy exists.
	    # The explicit default keeps the field optional under Pydantic v2, where
	    # a bare Optional[...] annotation is nullable but still required.
	    processed_url: Optional[str] = None

	class ProcessResponse(BaseModel):
	    """Response body describing the poster and screenshots found for a title."""

	    # Title identifier as supplied by the caller.
	    title_id: str
	    # Numeric TMDb identifier for the title.
	    tmdb_id: int
	    # Number of screenshots the caller asked for.
	    requested_shots: int
	    # Count of screenshot candidates reported by the endpoint.
	    total_screenshots_scanned: int
	    # Poster entry, or None when no poster is available. The explicit default
	    # keeps the field optional under Pydantic v2, where a bare Optional[...]
	    # annotation is nullable but still required.
	    poster: Optional[ImageMedia] = None
	    # Screenshot entries; may be empty.
	    screenshots: List[ImageMedia]

	# --- 1. OCR Scanner Function ---
	# Matches every character that is NOT an ASCII letter/digit, a Devanagari
	# code point (U+0900-U+097F) or a Telugu code point (U+0C00-U+0C7F).
	# Compiled once at import time instead of on every call.
	_NON_TEXT_CHARS = re.compile(r'[^a-zA-Z0-9\u0900-\u097F\u0C00-\u0C7F]')


	def check_text_in_image(image_bytes: bytes) -> bool:
	    """Report whether OCR finds a meaningful amount of text in an image.

	    The image is shrunk to fit within 500x500 and converted to grayscale
	    before being passed to Tesseract with the English, Hindi and Telugu
	    language models. Everything except letters and digits of those scripts
	    is stripped from the OCR output before counting.

	    Args:
	        image_bytes: Raw encoded image data.

	    Returns:
	        True when more than 4 recognised characters remain, or when the
	        image cannot be decoded or scanned at all (fail closed: an image
	        that cannot be verified is treated as containing text).
	        False otherwise.
	    """
	    try:
	        # The context manager releases the decoder and its buffers
	        # deterministically instead of waiting for garbage collection.
	        with Image.open(io.BytesIO(image_bytes)) as img:
	            img.thumbnail((500, 500))  # shrink in place for faster scanning
	            grayscale = img.convert('L')
	            text = pytesseract.image_to_string(grayscale, lang='eng+hin+tel')

	        clean_text = _NON_TEXT_CHARS.sub('', text)

	        # More than 4 characters of text is treated as "has text".
	        return len(clean_text) > 4
	    except Exception as e:
	        # Deliberately broad: any decoding/OCR failure rejects the image.
	        print(f"OCR Parsing Error: {e}")
	        return True

	# --- 2. Parallel OptiPix Function ---
	async def optimize_image(client: httpx.AsyncClient, raw_url: str, level: str):
	    """Ask the OptiPix service to optimize one image.

	    Posts the image URL and compression level as form data to OPTIPIX_API
	    with a 30 second timeout. Any failure (network error, non-JSON body,
	    unexpected payload) is logged and swallowed so that one bad image does
	    not affect the others.

	    Args:
	        client: HTTP client used to send the request.
	        raw_url: URL of the image to optimize.
	        level: Compression level forwarded to the service.

	    Returns:
	        A dict with "original_url" (always raw_url) and "processed_url"
	        (the "url" value of the service response when it reports success,
	        otherwise None).
	    """
	    optimized_url = None
	    payload = {"level": level, "url": raw_url}
	    try:
	        response = await client.post(OPTIPIX_API, data=payload, timeout=30.0)
	        body = response.json()
	        if body.get("success"):
	            optimized_url = body.get("url")
	    except Exception as e:
	        print(f"OptiPix failed for {raw_url} - Error: {e}")
	    return {"original_url": raw_url, "processed_url": optimized_url}

	async def _tmdb_get_json(client, url, params):
	    """Fetch a TMDb endpoint and return its decoded JSON object.

	    A 404 yields an empty dict so callers can treat it as "nothing found".
	    Transport errors, other non-200 statuses and undecodable bodies raise
	    HTTPException(502). Exception text is never echoed back because the
	    request URL carries the API key in its query string.
	    """
	    try:
	        res = await client.get(url, params=params)
	    except httpx.HTTPError as exc:
	        print(f"TMDb request error: {type(exc).__name__}")
	        raise HTTPException(status_code=502, detail="TMDb request failed") from exc

	    if res.status_code == 404:
	        return {}
	    if res.status_code != 200:
	        raise HTTPException(
	            status_code=502,
	            detail=f"TMDb returned status {res.status_code}",
	        )

	    try:
	        data = res.json()
	    except ValueError as exc:
	        raise HTTPException(status_code=502, detail="TMDb returned invalid JSON") from exc
	    return data if isinstance(data, dict) else {}


	async def _collect_clean_screenshots(client, backdrops, wanted):
	    """Return up to `wanted` backdrop URLs in which OCR finds no text.

	    Backdrops are downloaded and scanned one at a time, in the given order,
	    stopping as soon as enough clean images have been collected.
	    """
	    accepted = []
	    for shot in backdrops:
	        if len(accepted) >= wanted:
	            break  # enough screenshots collected

	        shot_url = f"https://image.tmdb.org/t/p/original{shot['file_path']}"
	        try:
	            download = await client.get(shot_url, timeout=10.0)
	            if download.status_code != 200:
	                continue
	            # Run OCR in a worker thread so the event loop is not blocked.
	            has_text = await asyncio.to_thread(check_text_in_image, download.content)
	        except Exception as e:
	            # Best effort: a failing image is skipped, not fatal.
	            print(f"Image download error for OCR: {e}")
	            continue

	        if has_text:
	            print(f"Rejected by OCR (Text Found): {shot_url}")
	        else:
	            accepted.append(shot_url)
	            print(f"Clean Screenshot Passed OCR: {shot_url}")
	    return accepted


	@app.post("/get-media", response_model=ProcessResponse)
	async def get_media(
	    title_id: str = Form(..., description="IMDb Title ID (e.g., tt3801314)"),
	    top_shots: int = Form(3, description="Number of screenshots required"),
	    level: str = Form("extreme", description="Compression level")
	):
	    """Return an optimized poster and text-free screenshots for an IMDb title.

	    Steps: resolve the IMDb ID to a TMDb movie, list its images, pick the
	    widest poster, OCR-filter the language-less backdrops (widest first)
	    until `top_shots` clean ones are found, then send the poster and the
	    accepted screenshots to OptiPix concurrently.

	    Args:
	        title_id: IMDb title ID.
	        top_shots: Number of clean screenshots wanted.
	        level: Compression level forwarded to OptiPix.

	    Returns:
	        A ProcessResponse with the poster (or None) and the screenshots.

	    Raises:
	        HTTPException: 500 when TMDB_API_KEY is not configured, 404 when
	            TMDb has no movie for the IMDb ID, 502 when TMDb cannot be
	            reached or answers with an unexpected status or body.
	    """
	    # Function-scope import keeps this block self-contained.
	    from urllib.parse import quote

	    if not TMDB_API_KEY:
	        raise HTTPException(status_code=500, detail="TMDB_API_KEY is missing!")

	    async with httpx.AsyncClient(timeout=120.0) as client:
	        # --- STEP 1: resolve the TMDb ID ---
	        # title_id is user input: percent-encode it so it cannot alter the
	        # request path, and let httpx encode the query parameters.
	        find_data = await _tmdb_get_json(
	            client,
	            f"https://api.themoviedb.org/3/find/{quote(title_id, safe='')}",
	            {"external_source": "imdb_id", "api_key": TMDB_API_KEY},
	        )

	        movie_results = find_data.get("movie_results") or []
	        if not movie_results:
	            # Returning a plain error dict here would fail validation against
	            # ProcessResponse and surface as a 500; raise a proper 404.
	            raise HTTPException(
	                status_code=404,
	                detail="TMDb पर इस IMDb ID की कोई मूवी नहीं मिली!",
	            )

	        tmdb_id = movie_results[0]["id"]

	        # --- STEP 2: fetch the image listing from TMDb ---
	        img_data = await _tmdb_get_json(
	            client,
	            f"https://api.themoviedb.org/3/movie/{tmdb_id}/images",
	            {"api_key": TMDB_API_KEY},
	        )
	        raw_backdrops = img_data.get("backdrops") or []
	        raw_posters = img_data.get("posters") or []

	        # Keep only backdrops whose language is null, widest first.
	        clean_backdrops = sorted(
	            (shot for shot in raw_backdrops if shot.get("iso_639_1") is None),
	            key=lambda shot: shot.get("width") or 0,
	            reverse=True,
	        )

	        # --- STEP 3: pick the widest poster ---
	        best_poster_url = None
	        if raw_posters:
	            widest = max(raw_posters, key=lambda poster: poster.get("width") or 0)
	            best_poster_url = f"https://image.tmdb.org/t/p/original{widest['file_path']}"

	        # --- STEP 4: OCR-filter the backdrops ---
	        verified_screenshots_urls = await _collect_clean_screenshots(
	            client, clean_backdrops, top_shots
	        )

	        # --- STEP 5: optimize poster and screenshots in parallel (OptiPix) ---
	        tasks = [optimize_image(client, url, level) for url in verified_screenshots_urls]
	        if best_poster_url:
	            # Poster goes first so it can be split off by position below.
	            tasks.insert(0, optimize_image(client, best_poster_url, level))

	        results = await asyncio.gather(*tasks)

	        if best_poster_url:
	            final_poster, final_screenshots = results[0], results[1:]
	        else:
	            final_poster, final_screenshots = None, results

	        return ProcessResponse(
	            title_id=title_id,
	            tmdb_id=tmdb_id,
	            requested_shots=top_shots,
	            # NOTE(review): this is the number of candidate backdrops, not the
	            # number actually downloaded and scanned (the OCR loop stops
	            # early). Kept as-is for client compatibility; confirm intent.
	            total_screenshots_scanned=len(clean_backdrops),
	            poster=final_poster,
	            screenshots=final_screenshots
	        )

	# Script entry point: serve the app on all interfaces at port 7860.
	if __name__ == "__main__":
	    uvicorn.run(
	        "app:app",
	        port=7860,
	        host="0.0.0.0",
	    )