# ═══════════════════════════════════════════════════════════════════════════════ # COMPATIBILITY PATCHES — must run before any other import # ═══════════════════════════════════════════════════════════════════════════════ import sys, types # 1) Python 3.13 removed audioop; pydub needs it via gradio 4.44 for _m in ("audioop", "pyaudioop"): if _m not in sys.modules: sys.modules[_m] = types.ModuleType(_m) # 2) gradio_client schema patches (bool/non-dict schema guard) import gradio_client.utils as _gcu _orig_get_type = _gcu.get_type _orig_inner = _gcu._json_schema_to_python_type _orig_j2p = _gcu.json_schema_to_python_type def _safe_get_type(schema): if not isinstance(schema, dict): return {} return _orig_get_type(schema) def _safe_inner(schema, defs=None): if not isinstance(schema, dict): return "Any" return _orig_inner(schema, defs) def _safe_j2p(schema): if not isinstance(schema, dict): return "Any" return _orig_j2p(schema) _gcu.get_type = _safe_get_type _gcu._json_schema_to_python_type = _safe_inner _gcu.json_schema_to_python_type = _safe_j2p # 3) FIX: gradio 4.44.0 calls templates.TemplateResponse(dict, dict) — passes # the context dict as the first arg (template name) instead of a string. # Starlette's TemplateResponse signature is (name: str, context: dict, ...). # We patch TemplateResponse to detect the swapped args and correct them. import starlette.templating as _st _OrigJinja2Templates = _st.Jinja2Templates class _PatchedJinja2Templates(_OrigJinja2Templates): def TemplateResponse(self, *args, **kwargs): # Gradio 4.44 bug: passes (dict_context, dict_context) — first arg is # a dict instead of the template name string. if args and isinstance(args[0], dict): ctx = args[0] # The real template name lives in the context under "request" scope; # gradio always uses "index.html" as its main template. name = ctx.pop("__template_name__", None) or "index.html" # Reconstruct as (name, context, *rest) args = (name, ctx) + args[1:] return super().TemplateResponse(*args, **kwargs) # Patch into starlette.templating so any already-imported reference is updated _st.Jinja2Templates = _PatchedJinja2Templates # Also patch it into gradio.routes which imported Jinja2Templates at load time try: import gradio.routes as _gr if hasattr(_gr, 'templates') and _gr.templates is not None: _gr.templates.__class__ = _PatchedJinja2Templates except Exception: pass # ═══════════════════════════════════════════════════════════════════════════════ # MAIN APP # ═══════════════════════════════════════════════════════════════════════════════ import gradio as gr import re, os, tempfile from huggingface_hub import InferenceClient HF_TOKEN = os.environ.get("HF_TOKEN", "") MODEL_ID = "google/gemma-2-9b-it" client = InferenceClient(model=MODEL_ID, token=HF_TOKEN) # ── SRT helpers ──────────────────────────────────────────────────────────────── def parse_srt(content: str) -> list: blocks = [] for block in re.split(r'\n\s*\n', content.strip()): lines = block.strip().splitlines() if len(lines) < 2 or not lines[0].strip().isdigit() or '-->' not in lines[1]: continue blocks.append({ 'number': int(lines[0].strip()), 'timecode': lines[1].strip(), 'text': '\n'.join(lines[2:]).strip(), }) return blocks def blocks_to_srt(blocks: list) -> str: return '\n\n'.join( f"{i}\n{b['timecode']}\n{b['text']}" for i, b in enumerate(blocks, 1) ) + '\n' # ── Detection ────────────────────────────────────────────────────────────────── REMOVE_RE = re.compile( r'https?://\S+|www\.\S+\.\S+' r'|subscene|opensubtitles|addic7ed|podnapisi|subdownloader' r'|subtitles?\s+by\s+\w+|translated\s+by\s+\w+' r'|encoded\s+by|synced\s+by|ripped\s+by|corrected\s+by' r'|\bcopyright\b|©|all\s+rights\s+reserved' r'|downloaded\s+from|subtitle\s+group', re.IGNORECASE ) def is_metadata(text: str) -> bool: return bool(REMOVE_RE.search(text)) def gemma_classify(text: str) -> str: if not text.strip(): return 'REMOVE' prompt = ( "user\n" "Is this subtitle text metadata (credits/copyright/website/promo) or real dialogue?\n" "Reply ONLY 'REMOVE' or 'KEEP'.\n\nText:\n" + text + "\n\nmodel\n" ) try: r = client.text_generation(prompt, max_new_tokens=10, temperature=0.1, do_sample=False) return 'REMOVE' if 'REMOVE' in r.strip().upper() else 'KEEP' except Exception as e: print(f"[Gemma] {e}") return 'KEEP' # ── Core ─────────────────────────────────────────────────────────────────────── def process(content: str, use_gemma: bool): blocks = parse_srt(content) if not blocks: return "", "⚠️ කිසිම subtitle block එකක් හමු නොවීය." kept, log = [], [] for b in blocks: t = b['text'] if is_metadata(t): log.append(f"#{b['number']} [REGEX]: {t[:80]}") continue if use_gemma and t.strip() and gemma_classify(t) == 'REMOVE': log.append(f"#{b['number']} [GEMMA]: {t[:80]}") continue kept.append(b) report = [ f"✅ සම්පූර්ණ blocks: {len(blocks)}", f"🗑️ ඉවත් කළ blocks: {len(blocks)-len(kept)}", f"✔️ ඉතිරි blocks: {len(kept)}", "", ] if log: report += ["ඉවත් කළ lines:"] + [f" • {l}" for l in log] return blocks_to_srt(kept), '\n'.join(report) def save_tmp(text: str) -> str: f = tempfile.NamedTemporaryFile(mode='w', suffix='_cleaned.srt', encoding='utf-8', delete=False) f.write(text); f.close() return f.name # ── Handlers ─────────────────────────────────────────────────────────────────── def handle_file(path, use_gemma): if not path: return "", "ගොනුවක් තෝරන්න.", None try: with open(path, encoding='utf-8-sig', errors='replace') as f: content = f.read() except Exception as e: return "", f"❌ {e}", None cleaned, report = process(content, use_gemma) return cleaned, report, save_tmp(cleaned) if cleaned else None def handle_text(text, use_gemma): if not text.strip(): return "", "⚠️ SRT content ඇතුළු කරන්න.", None cleaned, report = process(text, use_gemma) return cleaned, report, save_tmp(cleaned) if cleaned else None # ── UI ───────────────────────────────────────────────────────────────────────── css = ".mono{font-family:monospace;font-size:13px} footer{display:none!important}" with gr.Blocks(title="🎬 Sinhala SRT Cleaner", theme=gr.themes.Soft(), css=css) as demo: gr.Markdown("# 🎬 සිංහල SRT Subtitle Cleaner\n**Gemma 2 9B** භාවිතා කර author credits, website URLs, copyright notices ඉවත් කරයි.") with gr.Tabs(): with gr.TabItem("📁 ගොනු Upload"): with gr.Row(): with gr.Column(scale=1): file_in = gr.File(label="SRT ගොනුව", file_types=[".srt"], type="filepath") gemma_cb1 = gr.Checkbox(label="🤖 Gemma AI", value=True, info="Uncheck → Regex only (ඉක්මන්)") btn1 = gr.Button("🚀 Process", variant="primary", size="lg") with gr.Column(scale=2): report1 = gr.Textbox(label="📊 Report", lines=8, interactive=False) preview1 = gr.Textbox(label="✅ Cleaned Preview", lines=15, interactive=False, elem_classes="mono") dl1 = gr.File(label="⬇️ Download") btn1.click(fn=handle_file, inputs=[file_in, gemma_cb1], outputs=[preview1, report1, dl1]) with gr.TabItem("📝 Text Paste"): with gr.Row(): with gr.Column(): text_in = gr.Textbox(label="SRT Content", lines=15, elem_classes="mono", placeholder="1\n00:00:01,000 --> 00:00:04,000\nExample...") gemma_cb2 = gr.Checkbox(label="🤖 Gemma AI", value=True) btn2 = gr.Button("🚀 Process", variant="primary") with gr.Column(): report2 = gr.Textbox(label="📊 Report", lines=6, interactive=False) preview2 = gr.Textbox(label="✅ Cleaned Output", lines=15, interactive=False, elem_classes="mono") dl2 = gr.File(label="⬇️ Download") btn2.click(fn=handle_text, inputs=[text_in, gemma_cb2], outputs=[preview2, report2, dl2]) with gr.TabItem("🔌 API"): gr.Markdown(""" ## API Endpoint ### Python — File ```python from gradio_client import Client client = Client("your-username/srt-cleaner") cleaned, report, file = client.predict( "path/to/file.srt", True, api_name="/handle_file" ) ``` ### Python — Text ```python cleaned, report, file = client.predict( "1\\n00:00:01,000 --> 00:00:04,000\\nwww.subscene.com\\n\\n2\\n...", True, api_name="/handle_text" ) ``` ### cURL ```bash curl -X POST "https://your-space.hf.space/run/predict" \\ -H "Content-Type: application/json" \\ -d '{"fn_index":1,"data":["SRT_CONTENT",true]}' ``` """) gr.Markdown("---\n**Setup:** Space Settings → Secrets → `HF_TOKEN` set කරන්න.") demo.queue(max_size=10) if __name__ == "__main__": demo.launch(server_name="0.0.0.0", server_port=7860)