# NOTE: "Spaces: Running" is a Hugging Face Spaces status banner captured by
# the paste — it is not part of the program.
import json
import logging
import os
import re
from typing import Any, Dict, List, Optional, Union

import httpx
from fastapi import FastAPI, HTTPException, Request
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
| # --- SETUP --- | |
| app = FastAPI(title="Gemini Smart Proxy") | |
| # Handle CORS | |
| app.add_middleware( | |
| CORSMiddleware, | |
| allow_origins=["*"], | |
| allow_credentials=True, | |
| allow_methods=["POST", "OPTIONS"], | |
| allow_headers=["Content-Type"], | |
| ) | |
| # Logger setup | |
| logging.basicConfig(level=logging.INFO) | |
| logger = logging.getLogger(__name__) | |
| # --- CLEANING HELPER --- | |
| def clean_input(text: str) -> str: | |
| """ | |
| Removes all whitespace and invisible characters. | |
| Only allows: a-z, A-Z, 0-9, dash (-), underscore (_), and dot (.) | |
| """ | |
| if not text: | |
| return "" | |
| s = str(text).strip() | |
| return re.sub(r'[^a-zA-Z0-9_.-]', '', s) | |
| # --- SYSTEM PROMPTS --- | |
| PROMPTS = { | |
| "VERIFY_SINGLE": """You are an advanced subtitle verifier and corrector. You will be given a single pair of data: an original SRT block and an IMAGE of a corresponding PDF page. The image is the ground truth. | |
| Your task is to perform two actions and return the result as a single JSON object. | |
| Your response MUST be a valid JSON object that strictly follows this schema: | |
| { | |
| "type": "OBJECT", | |
| "properties": { | |
| "errorReport": { "type": "STRING" }, | |
| "correctedSrt": { "type": "STRING" } | |
| }, | |
| "required": ["errorReport", "correctedSrt"] | |
| } | |
| INSTRUCTIONS FOR EACH JSON KEY: | |
| 1. **"errorReport"**: Perform OCR on the image. Report ONLY significant errors (mismatch, extraneous/missing text). If none, this string MUST be "No significant errors found.". | |
| 2. **"correctedSrt"**: Generate a corrected SRT block for the current subtitle pair. | |
| All timestamps must remain exactly as in the original, and the output must contain only the raw SRT text — no explanations, no JSON, no metadata. | |
| Guidelines: | |
| 1. Source Priority: | |
| • Use the OCR image text as the primary source of truth. | |
| • If the OCR result contains Burmese text, use it as the main subtitle text. | |
| 2. Language Inclusion: | |
| • Keep both Burmese and English lines if they exist. | |
| • Do not include any other languages besides Burmese and English. | |
| 3. When OCR Text Is Missing: | |
| • If the OCR image contains no readable text, keep the timestamps exactly the same and output a blank subtitle line. | |
| • Do not delete, skip, or merge any subtitle blocks. | |
| 4. Preservation Rules: | |
| • Always preserve original timestamps, line breaks, and Burmese punctuation (။, ၊). | |
| • Maintain the same block numbering sequence as in the input. | |
| 5. Output Format: | |
| • Output only the clean, corrected SRT block. | |
| • No explanations, no quotes, no formatting outside the SRT syntax.""", | |
| "VERIFY_BATCH": """You are an advanced subtitle verifier and corrector. You will be given a BATCH of data containing several pairs of an original SRT block and a corresponding PDF page IMAGE. The image is the ground truth. | |
| Your task is to process EACH PAIR sequentially and return the result as a single JSON ARRAY, where each object in the array corresponds to a pair from the input. | |
| Your response MUST be a valid JSON array that strictly follows this schema: | |
| { | |
| "type": "ARRAY", | |
| "items": { | |
| "type": "OBJECT", | |
| "properties": { | |
| "errorReport": { "type": "STRING" }, | |
| "correctedSrt": { "type": "STRING" } | |
| }, | |
| "required": ["errorReport", "correctedSrt"] | |
| } | |
| } | |
| INSTRUCTIONS FOR EACH JSON OBJECT IN THE ARRAY: | |
| 1. **"errorReport"**: Perform OCR on the image for the current pair. Report ONLY significant errors (mismatch, extraneous/missing text). If none, this string MUST be "No significant errors found.". | |
| 2. “correctedSrt”: | |
| Generate a corrected SRT block for the current subtitle pair. | |
| All timestamps must remain exactly as in the original, and the output must contain only the raw SRT text — no explanations, no JSON, no metadata. | |
| Guidelines: | |
| 1. Source Priority: | |
| • Use the OCR image text as the primary source of truth. | |
| • If the OCR result contains Burmese text, use it as the main subtitle text. | |
| 2. Language Inclusion: | |
| • Keep both Burmese and English lines if they exist. | |
| • Do not include any other languages besides Burmese and English. | |
| 3. When OCR Text Is Missing: | |
| • If the OCR image contains no readable text, keep the timestamps exactly the same and output a blank subtitle line. | |
| • Do not delete, skip, or merge any subtitle blocks. | |
| 4. Preservation Rules: | |
| • Always preserve original timestamps, line breaks, and Burmese punctuation (။, ၊). | |
| • Maintain the same block numbering sequence as in the input. | |
| • Do not add missing Burmese punctuation (။) at the end of the line. | |
| • Do not add Burmese punctuation (။) | |
| 5. Output Format: | |
| • Output only the clean, corrected SRT block. | |
| • No explanations, no quotes, no formatting outside the SRT syntax. | |
| Process all pairs provided in the prompt and return a JSON array with the same number of objects as pairs you received.""", | |
| "TRANSLATE_BASE": """You are an expert subtitle translator. Your task is to translate the text in the provided JSON object to {{TARGET_LANGUAGE}}. | |
| 1. The user will provide a JSON object where keys are IDs (e.g., 'line_0', 'line_1') and values are the text lines. | |
| 2. You MUST translate the text *value* for each key into {{TARGET_LANGUAGE}}. | |
| 3. You MUST preserve all SRT/ASS formatting tags exactly as they appear (e.g., \`{\\an8}\`, \`<i>\`, \`</i>\`). Do NOT translate the content of these tags. | |
| 4. You MUST respond ONLY with a valid JSON object, containing the *exact same keys* as the input, with the translated text as the values. | |
| 5. Do not include \`json\` or \`\`\`json markers in your response. Respond only with the JSON object itself. | |
| 6. CRITICAL: Ensure all double quotes (") within the translated text *values* are properly escaped with a backslash (e.g., \\"example\\"). This is essential for the JSON to be valid. | |
| 7. CRITICAL JSON REQUIREMENT: If the subtitle text contains backslashes (e.g., in formatting tags like \`{\\an8}\`, \`\\N\`), you MUST escape them as double backslashes (e.g., \`{\\\\an8}\`, \`\\\\N\`) in the JSON string value. Failure to do this will break the JSON parser.""", | |
| "TRANSLATE_NATURAL_ADDON": """\n**CRITICAL INSTRUCTIONS:** | |
| 1. **Context-Aware Pronouns:** Pay close attention to the flow of conversation (within the batch) to choose the most appropriate pronouns. Translate based on the inferred relationship and formality between speakers. | |
| 2. **Natural Flow:** The translation should sound natural in the target language, not like a literal word-for-word translation. | |
| 3. **Formatting:** Keep punctuation appropriate for the target language.""", | |
| "TRANSLATE_BASIC_ADDON": """\n**CRITICAL INSTRUCTIONS:** | |
| 1. **Direct Translation:** Prioritize accuracy over style.""", | |
| "ANALYZE_CONTEXT": """You are a linguistic expert specializing in Burmese translation context. | |
| Analyze the provided subtitle excerpt (which is the beginning of a movie). | |
| Your goal is to extract context to help a translator choose the correct Burmese Pronouns and Vocabulary. | |
| Please output a concise "Translator's Note" covering: | |
| 1. **Genre & Tone:** (e.g., Action, Romance, Adult, Historical). | |
| 2. **Main Characters & Relationships:** Who is talking to whom? (e.g., "A and B are lovers", "C is D's boss"). | |
| 3. **Pronoun Guide (CRITICAL):** - For each pair of speakers, specify the correct Burmese pronouns. | |
| - Examples: | |
| - "Male to Male (Friends): Use 'Min/Nga' (မင်း/ငါ)" | |
| - "Female to Male (Lovers): Use 'Maung/Mel' or intimate 'Nin/Nga'" | |
| - "Formal/Stranger: Use 'Khim-byar/Kyun-daw' or 'Shin/Kyun-ma'" | |
| **Output Format:** Just provide the analysis text. Do not translate the subtitles yet.""", | |
| "TRANSCRIBE": """ | |
| You are an expert subtitle editor for movies and music videos. | |
| Transcribe the provided audio file. | |
| Your output MUST be *only* in the standard SRT (SubRip Text) file format. | |
| Do not include any other text, explanations, or markdown formatting (like \`\`\`srt). | |
| Follow these professional subtitling rules: | |
| 1. **Timing:** Timestamps must be precise and tightly synced to the spoken words. | |
| 2. **Line Breaks:** Keep subtitles to a maximum of 2 lines. | |
| 3. **Readability:** Break lines at natural pauses, sentence ends, or clauses. Do not leave single words on a line. | |
| 4. **Length:** Aim for a maximum of 42 characters per line. This is a guideline for readability. | |
| 5. **Format:** The SRT format must be strictly followed: | |
| 1 | |
| HH:MM:SS,MS --> HH:MM:SS,MS | |
| First line of text. | |
| Second line of text. | |
| 2 | |
| HH:MM:SS,MS --> HH:MM:SS,MS | |
| Next subtitle.""", | |
| "TRANSCRIBE_CHUNK": """ | |
| You are a precision subtitle generator. | |
| Transcribe ONLY the spoken words in this audio clip. | |
| OUTPUT FORMAT: | |
| Return ONLY a valid JSON array of objects. Do not wrap in markdown. | |
| Structure: [{"start": "MM:SS.mmm", "end": "MM:SS.mmm", "text": "spoken text"}] | |
| CRITICAL RULES: | |
| 1. **NO HALLUCINATIONS:** If there is silence, music only, or no clear speech, return an empty array []. Do NOT invent text like "Welcome to the video", "Subscribe", or "Next steps". | |
| 2. **TIMESTAMPS:** Timestamps must be relative to the beginning of *this specific audio file* (00:00.000). | |
| 3. **VERBATIM:** Transcribe exactly what is said. Do not summarize. | |
| 4. **JSON ONLY:** Raw JSON array only.""", | |
| "ANALYZE_VIDEO": """You are an expert content moderator for a major video platform like YouTube. Your task is to analyze the provided video and assign it one of three moderation levels. You must distinguish between content that is NOT AD-FRIENDLY (but allowed) and content that VIOLATES COMMUNITY GUIDELINES (and must be removed). | |
| **Your 3-Tier Decision:** | |
| 1. **"Safe" (Ad-Friendly):** | |
| * **Description:** The content is clean, safe for all advertisers, and has no issues. | |
| * **Action:** Full monetization. | |
| * **Categories:** [] | |
| 2. **"Borderline" (Not Ad-Friendly):** | |
| * **Description:** The content is ALLOWED on the platform but is NOT suitable for most advertisers. It does NOT break community guidelines. | |
| * **Action:** Limited or no ads (demonetization). | |
| * **Categories:** | |
| * **Inappropriate Language:** Frequent use of profanity. | |
| * **Suggestive Content:** Non-explicit sexual themes, "beach fails," suggestive dancing, revealing outfits that are not nudity. | |
| * **Moderate Violence:** Non-graphic violence (e.g., in news, documentaries, or video games). | |
| * **Sensitive Topics:** Non-graphic discussion of war, tragedy, or other sensitive events. | |
| * **Inauthentic (Low-Effort):** Low-effort, machine-generated slideshows, robotic TTS voices that are not for accessibility. (This is allowed, but often demonetized at a channel level). | |
| 3. **"Violation" (Community Guideline Break):** | |
| * **Description:** The content is NOT ALLOWED on the platform and must be flagged for removal. | |
| * **Action:** Remove video, issue channel strike. | |
| * **Categories:** | |
| * **Hate Speech:** Direct attacks or promotion of violence/hatred against a protected group. | |
| * **Harassment & Bullying:** Malicious, targeted attacks on an individual. | |
| * **Graphic Violence:** Depictions of extreme, gratuitous violence intended to shock (outside of clear, brief news context). | |
| * **Dangerous Acts / Self-Harm:** Promoting or showing in detail acts that could lead to serious injury or suicide. | |
| * **Pornography / Explicit Nudity:** Any explicit sexual acts or nudity intended for sexual gratification (not educational or artistic). | |
| * **Harmful Misinformation:** Content that poses a direct, real-world harm (e.g., medical, civic). | |
| * **Spam & Scams:** Deceptive practices, fraudulent schemes. | |
| **Context is Key:** An educational video on breast cancer (non-sexual nudity) is "Safe". A news report on a conflict (sensitive topic) is "Borderline". A video promoting hatred is a "Violation". | |
| **Your Response Format (JSON):** | |
| You MUST return *only* a valid JSON object with this structure: | |
| { | |
| "decision": "Safe" | "Borderline" | "Violation", | |
| "categories_found": ["category1", "category2", ...] | [], | |
| "reasoning": "A brief, neutral explanation for your decision. Justify *why* it fits the 'Safe', 'Borderline', or 'Violation' tier, citing context." | |
| } | |
| Analyze the video's visual and audio content and return the JSON report.""", | |
| "ANCHOR_FINDER": """You are an expert subtitle anchor point finder. I will give you a JSON payload. | |
| 1. **"source_lines"**: A JSON object where keys are line numbers and values are subtitle text in a foreign language (e.g., Burmese). | |
| 2. **"reference_srt"**: The *entire* text content of the reference .srt file (in English). | |
| Your job is to: | |
| 1. For *each* line in "source_lines", translate it to English. | |
| 2. Semantically search the "reference_srt" to find the *single best matching line*. | |
| 3. You MUST return a JSON object, mapping each *source* line number (as a string key) to the *matching reference* line number (as a number value). | |
| **EXAMPLE:** | |
| - **Source:** \`{"100": "မြန်မာစာ..."}\` | |
| - **Reference:** "...110...Hello...111...How are you...112...Burmese text..." | |
| - **Your Response:** \`{"100": 112}\` | |
| """, | |
| "REPHRASE_DEFAULT": """You are an expert subtitle editor. Your task is to rephrase the text in the provided JSON object. | |
| 1. The user will provide a JSON object where keys are IDs (e.g., 'line_0', 'line_1') and values are the original subtitle text lines. | |
| 2. You MUST rephrase the text *value* for each key. The meaning must be identical, but the wording should be different (use synonyms, change sentence structure). | |
| 3. You MUST preserve all SRT/ASS formatting tags exactly as they appear (e.g., \`{\\an8}\`, \`<i>\`, \`</i>\`). Do NOT alter the content of these tags. | |
| 4. You MUST respond ONLY with a valid JSON object, containing the *exact same keys* as the input, with the rephrased text as the values. | |
| 5. Do not include \`json\` or \`\`\`json markers in your response. Respond only with the JSON object itself. | |
| 6. CRITICAL: Ensure all double quotes (") within the rephrased text *values* are properly escaped with a backslash (e.g., \\"example\\"). This is essential for the JSON to be valid. | |
| 7. Maintain the original language. Do not translate. | |
| 8. Do not change pronouns. | |
| 9. CRITICAL JSON REQUIREMENT: If the subtitle text contains backslashes (e.g., in formatting tags like \`{\\an8}\`, \`\\N\`), you MUST escape them as double backslashes (e.g., \`{\\\\an8}\`, \`\\\\N\`) in the JSON string value. Failure to do this will break the JSON parser.""", | |
| "LINE_FIXER_BURMESE": """You are a professional Burmese subtitle editor. Your task is to fix line breaks and split long subtitles. | |
| RULES: | |
| 1. NO CONTENT CHANGE: Do not add, delete, or change words. Only adjust whitespace/newlines. | |
| 2. STRICT MAX 2 LINES: Every subtitle event must be 1 or 2 lines. Never 3. | |
| 3. SPLIT LONG LINES (CRITICAL): | |
| - If a subtitle is too long for 2 lines, you MUST split it into multiple separate events. | |
| - Return an ARRAY of strings for that ID. | |
| - Example: "12": ["Part 1 text...", "Part 2 text..."] | |
| 4. BURMESE GRAMMAR & FLOW (CRITICAL): | |
| - NO ORPHANS: Never leave a single short word (like "ထင်တယ်", "တယ်", "မယ်", "ပါ", "နော်") on a new line by itself. Join it to the previous line. | |
| - NO SEPARATION: Do not separate modifiers like "အဲဒီ", "ဒီ", "ဟို" from the following noun. Do not start a line with particles. | |
| - BALANCE: Try to make lines roughly equal length, UNLESS it breaks a grammar rule. Grammar always wins. | |
| 5. JSON FORMAT: Respond only with valid JSON. Escape double quotes properly. | |
| CRITICAL JSON REQUIREMENT: If the subtitle text contains backslashes (e.g., in formatting tags like \`{\\an8}\`, \`\\N\`), you MUST escape them as double backslashes (e.g., \`{\\\\an8}\`, \`\\\\N\`) in the JSON string value. Failure to do this will break the JSON parser.""", | |
| "LINE_FIXER_ENGLISH": """You are a Netflix Certified Subtitle QC Editor. Your task is to conform the provided English subtitles to the "Netflix English Timed Text Style Guide". | |
| RULES: | |
| 1. **Character Limitation:** Maximum 42 characters per line. Max 2 lines. Split if longer (return Array). | |
| 2. **Line Breaking Logic:** No splitting noun/article, name/surname. Break after punctuation/before conjunctions. | |
| 3. **Text Clean-Up:** Smart quotes. Smart ellipsis. Remove speaker labels unless off-screen. Remove filler words. | |
| 4. **JSON FORMAT:** Respond only with valid JSON. Key = ID. Value = String or Array of Strings. Escape double quotes. | |
| CRITICAL JSON REQUIREMENT: If the subtitle text contains backslashes (e.g., in formatting tags like \`{\\an8}\`, \`\\N\`), you MUST escape them as double backslashes (e.g., \`{\\\\an8}\`, \`\\\\N\`) in the JSON string value. Failure to do this will break the JSON parser.""", | |
| "LINE_FIXER_GENERAL": """You are a Universal Subtitle Formatter. Your task is to clean and format raw text into professional subtitles for ANY language. | |
| RULES: | |
| 1. **Structure:** Strictly 1 or 2 lines. Split >40 chars or >80 chars total into separate events (return Array). | |
| 2. **Pyramid Formatting:** Prefer "Bottom-Heavy" pyramid. | |
| 3. **Clean Up:** Fix basic punctuation. Remove double spaces. | |
| 4. **Restriction:** Do not change meaning. | |
| 5. **JSON FORMAT:** Respond only with valid JSON. Key = ID. Value = String or Array. Escape double quotes. | |
| CRITICAL JSON REQUIREMENT: If the subtitle text contains backslashes (e.g., in formatting tags like \`{\\an8}\`, \`\\N\`), you MUST escape them as double backslashes (e.g., \`{\\\\an8}\`, \`\\\\N\`) in the JSON string value. Failure to do this will break the JSON parser.""", | |
| "LINE_FIXER_OVERLAP_ADDON": """\n\n6. **TIMING & OVERLAP OPTIMIZATION (IMPORTANT):**\n - Since you are splitting lines, ensure each split part is concise and short. \n - Avoid cramming too much text into one block, as this causes reading speed issues and display overlaps.""", | |
| "SUSPECT_CHECK": """You are a Senior Subtitle Quality Control (QC) Specialist specializing in Asian Language (Burmese/Korean/English) contexts. | |
| **YOUR GOAL:** Identify *High-Confidence Logic Errors* that break immersion. | |
| **DO NOT** flag minor stylistic choices, slightly awkward phrasing, or standard grammar variations. | |
| **ONLY** return a result if you are >90% sure it is an error. | |
| **LOOK FOR THESE SPECIFIC ERROR TYPES:** | |
| 1. **Contextual Logic Failures (The "Sheep vs. Thief" Rule):** | |
| - Text that is technically a valid word but makes NO SENSE in the specific scene context. | |
| - *Example:* A character trying to sleep counts "Thief 1, Thief 2" (Burmese 'Thu-khoe') instead of "Sheep 1, Sheep 2" (Burmese 'Thoe'). | |
| - *Example:* A character in a car says "I bought a *flower* to drive" (phonetic mix-up) instead of "car". | |
| 2. **Name/Entity Inconsistency:** | |
| - A character's name changes spelling within the batch or compared to common transliteration norms. | |
| - *Example:* "Lee Gi-wu" becomes "Lee Gi-pu" or "Yi Ki-woo" in the same scene. | |
| - *Example:* "Gyaung-toe" (weird transliteration) vs "Gyeonggi-do" (standard place name). | |
| 3. **Nonsense / Typo / OCR Garbage:** | |
| - Words that appear to be keyboard smashes or phonetic errors that result in gibberish. | |
| - *Example:* "Dog Night" (Khway Nya) vs "Sniffer Dog" (Khway Nan) in a detective context. | |
| *** SPECIAL INSTRUCTION *** | |
| In addition to the standard logic checks, you MUST flag lines containing: | |
| 1. Broken Burmese encoding or rendering issues (e.g. 'န ေါ ေ', misplaced independent vowels, or character ordering errors). | |
| 2. Obvious keyboard mashing or nonsensical gibberish. | |
| You must flag these as 'Encoding/Nonsense' errors even if you are not 90% confident about the intended meaning. These are high-priority formatting errors. | |
| *** END SPECIAL INSTRUCTION *** | |
| **INPUT FORMAT:** | |
| A block of text containing subtitle lines prefixed with their Line ID (e.g., "ID: [Text]"). | |
| **OUTPUT FORMAT:** | |
| Return a valid JSON Object containing a key "suspectLines". This key must be an Array of Objects. | |
| { | |
| "suspectLines": [ | |
| { | |
| "id": "Line ID", | |
| "original": "The original suspect text", | |
| "reason": "A concise explanation of the logic error (e.g., 'Context: Counting sheep for sleep, not thieves.')", | |
| "suggestion": "The proposed correction" | |
| } | |
| ] | |
| } | |
| If no *critical* errors are found in a batch, return { "suspectLines": [] }. | |
| CRITICAL: Return ONLY raw JSON. No markdown.""" | |
| } | |
| # --- Pydantic Models for Validation --- | |
| class RequestPayload(BaseModel): | |
| apiKey: str | |
| model: str | |
| task: str | |
| data: Dict[str, Any] = {} | |
| def get_safety_settings(): | |
| return [ | |
| {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_NONE"}, | |
| {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_NONE"}, | |
| {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_NONE"}, | |
| {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_NONE"}, | |
| ] | |
| async def handle_post(payload: RequestPayload): | |
| # 1. Nuclear Cleaning of critical inputs | |
| api_key = clean_input(payload.apiKey) | |
| model = clean_input(payload.model) | |
| task = str(payload.task).strip() | |
| data = payload.data | |
| if not api_key or not model or not task: | |
| raise HTTPException(status_code=400, detail="Missing apiKey, model, or task.") | |
| gemini_payload = None | |
| try: | |
| if task == "ping": | |
| return {"status": "success", "message": "Pong! FastAPI Worker is active."} | |
| elif task == "verify": | |
| if not data.get("parts"): raise ValueError("Missing 'data.parts'") | |
| is_batch = data.get("batchSize", 0) > 1 | |
| prompt = PROMPTS["VERIFY_BATCH"] if is_batch else PROMPTS["VERIFY_SINGLE"] | |
| gemini_payload = { | |
| "systemInstruction": {"parts": [{"text": prompt}]}, | |
| "contents": [{"role": "user", "parts": data["parts"]}], | |
| "generationConfig": {"responseMimeType": "application/json"}, | |
| "safetySettings": get_safety_settings(), | |
| } | |
| elif task == "translate": | |
| target_lang = data.get("targetLanguage", "Burmese (Myanmar)") | |
| prompt = PROMPTS["TRANSLATE_BASE"].replace("{{TARGET_LANGUAGE}}", target_lang) | |
| prompt += PROMPTS["TRANSLATE_BASIC_ADDON"] if data.get("promptVersion") == "basic" else PROMPTS["TRANSLATE_NATURAL_ADDON"] | |
| gemini_payload = { | |
| "systemInstruction": {"parts": [{"text": prompt}]}, | |
| "contents": [{"parts": [{"text": data.get("textBatch", "")}]}], | |
| "safetySettings": get_safety_settings(), | |
| } | |
| elif task == "transcribe" or task == "transcribe_chunk": | |
| prompt = PROMPTS["TRANSCRIBE"] if task == "transcribe" else PROMPTS["TRANSCRIBE_CHUNK"] | |
| gemini_payload = { | |
| "contents": [{"parts": [{"text": prompt}, {"inlineData": {"mimeType": data["mimeType"], "data": data["audioData"]}}]}], | |
| "safetySettings": get_safety_settings(), | |
| } | |
| if task == "transcribe_chunk": | |
| gemini_payload["generationConfig"] = {"responseMimeType": "application/json"} | |
| elif task == "line_fixer": | |
| mode = data.get("mode", "general") | |
| prompt = PROMPTS.get(f"LINE_FIXER_{mode.upper()}", PROMPTS["LINE_FIXER_GENERAL"]) | |
| if data.get("preventOverlap"): prompt += PROMPTS["LINE_FIXER_OVERLAP_ADDON"] | |
| gemini_payload = { | |
| "systemInstruction": {"parts": [{"text": prompt}]}, | |
| "contents": [{"parts": [{"text": data.get("textBatch", "")}]}], | |
| "safetySettings": get_safety_settings(), | |
| } | |
| elif task == "analyze_video": | |
| # Requires videoData (base64) and mimeType | |
| if "videoData" not in data or "mimeType" not in data: | |
| raise ValueError("analyze_video requires 'videoData' and 'mimeType'") | |
| gemini_payload = { | |
| "systemInstruction": {"parts": [{"text": PROMPTS["ANALYZE_VIDEO"]}]}, | |
| "contents": [{ | |
| "parts": [ | |
| {"inlineData": {"mimeType": data["mimeType"], "data": data["videoData"]}} | |
| ] | |
| }], | |
| # The prompt explicitly asks for a JSON object response | |
| "generationConfig": {"responseMimeType": "application/json"}, | |
| "safetySettings": get_safety_settings(), | |
| } | |
| elif task == "anchor_find": | |
| import json # Ensure json is available for dumping the payload | |
| # The prompt expects a strict JSON payload with "source_lines" and "reference_srt" | |
| # We assume the client sends this structure inside data["payload"] | |
| payload_content = data.get("payload", {}) | |
| gemini_payload = { | |
| "systemInstruction": {"parts": [{"text": PROMPTS["ANCHOR_FINDER"]}]}, | |
| "contents": [{"parts": [{"text": json.dumps(payload_content)}]}], | |
| "generationConfig": {"responseMimeType": "application/json"}, | |
| "safetySettings": get_safety_settings(), | |
| } | |
| elif task == "rephrase": | |
| # Rephrasing expects a JSON object of lines and returns a JSON object | |
| gemini_payload = { | |
| "systemInstruction": {"parts": [{"text": PROMPTS["REPHRASE_DEFAULT"]}]}, | |
| "contents": [{"parts": [{"text": data.get("textBatch", "")}]}], | |
| "generationConfig": {"responseMimeType": "application/json"}, | |
| "safetySettings": get_safety_settings(), | |
| } | |
| elif task == "suspect_check": | |
| # Suspect check analyzes text and returns a JSON report | |
| gemini_payload = { | |
| "systemInstruction": {"parts": [{"text": PROMPTS["SUSPECT_CHECK"]}]}, | |
| "contents": [{"parts": [{"text": data.get("textBatch", "")}]}], | |
| "generationConfig": {"responseMimeType": "application/json"}, | |
| "safetySettings": get_safety_settings(), | |
| } | |
| elif task == "analyze_context": | |
| # Context analysis returns plain text (Translator's Note), no JSON mode needed | |
| gemini_payload = { | |
| "systemInstruction": {"parts": [{"text": PROMPTS["ANALYZE_CONTEXT"]}]}, | |
| "contents": [{"parts": [{"text": data.get("textBatch", "")}]}], | |
| "safetySettings": get_safety_settings(), | |
| } | |
| else: | |
| # Default fallback for unhandled tasks in this snippet | |
| if task in PROMPTS: | |
| gemini_payload = { | |
| "systemInstruction": {"parts": [{"text": PROMPTS[task]}]}, | |
| "contents": [{"parts": [{"text": str(data.get("textBatch") or data.get("payload") or "")}]}], | |
| "safetySettings": get_safety_settings(), | |
| } | |
| else: | |
| raise HTTPException(status_code=400, detail=f"Unknown task: {task}") | |
| except Exception as e: | |
| raise HTTPException(status_code=400, detail=f"Payload Construction Error: {str(e)}") | |
| # --- EXECUTE GEMINI CALL (Robust Patch) --- | |
| # Using httpx.URL object to strictly define scheme and host | |
| target_url = httpx.URL( | |
| scheme="https", | |
| host="generativelanguage.googleapis.com", | |
| path=f"/v1beta/models/{model}:generateContent", | |
| params={"key": api_key} | |
| ) | |
| async with httpx.AsyncClient(timeout=300.0) as client: | |
| try: | |
| api_response = await client.post( | |
| target_url, | |
| json=gemini_payload, | |
| headers={"Content-Type": "application/json"} | |
| ) | |
| if api_response.status_code != 200: | |
| logger.error(f"Gemini API Error: {api_response.text}") | |
| raise HTTPException(status_code=api_response.status_code, detail=f"Gemini API Error: {api_response.text}") | |
| return api_response.json() | |
| except httpx.RequestError as e: | |
| logger.error(f"Network Error: {str(e)}") | |
| raise HTTPException(status_code=500, detail=f"Network Error: {str(e)}") |