Spaces:
No application file
No application file
| # ───────────────────────────────────────────────────────────────────────────── | |
| # COMPATIBILITY PATCHES — must run before any other import | |
| # ───────────────────────────────────────────────────────────────────────────── | |
import importlib
import sys
import types

# 1) Python 3.13 removed audioop; pydub needs it via gradio 4.44.
#    Only install an empty placeholder when the real module cannot be
#    imported, so an interpreter (or backport package) that still provides
#    audioop keeps its working implementation instead of being shadowed.
for _m in ("audioop", "pyaudioop"):
    if _m in sys.modules:
        continue
    try:
        importlib.import_module(_m)
    except ImportError:
        # Placeholder module: lets `import audioop` succeed at import time.
        sys.modules[_m] = types.ModuleType(_m)
| # 2) gradio_client schema patches (bool/non-dict schema guard) | |
import gradio_client.utils as _gcu

# Keep references to the unpatched helpers so each guard can delegate to them.
_orig_get_type = _gcu.get_type
_orig_inner = _gcu._json_schema_to_python_type
_orig_j2p = _gcu.json_schema_to_python_type


def _safe_get_type(schema):
    """Delegate to the original ``get_type`` for dict schemas; else return ``{}``."""
    if isinstance(schema, dict):
        return _orig_get_type(schema)
    return {}


def _safe_inner(schema, defs=None):
    """Delegate to the original private converter for dict schemas; else ``"Any"``."""
    if isinstance(schema, dict):
        return _orig_inner(schema, defs)
    return "Any"


def _safe_j2p(schema):
    """Delegate to the original public converter for dict schemas; else ``"Any"``."""
    if isinstance(schema, dict):
        return _orig_j2p(schema)
    return "Any"


# Install the guarded versions on the gradio_client.utils module.
_gcu.get_type = _safe_get_type
_gcu._json_schema_to_python_type = _safe_inner
_gcu.json_schema_to_python_type = _safe_j2p
| # 3) FIX: gradio 4.44.0 calls templates.TemplateResponse(dict, dict) — passes | |
| # the context dict as the first arg (template name) instead of a string. | |
| # Starlette's TemplateResponse signature is (name: str, context: dict, ...). | |
| # We patch TemplateResponse to detect the swapped args and correct them. | |
import starlette.templating as _st

_OrigJinja2Templates = _st.Jinja2Templates


class _PatchedJinja2Templates(_OrigJinja2Templates):
    """Jinja2Templates variant that tolerates a dict as the first positional arg.

    Works around the gradio 4.44 call shape described in the note above this
    patch, where a context dict arrives in the slot meant for the template name.
    """

    def TemplateResponse(self, *args, **kwargs):
        # Normal call shape: nothing to repair, forward untouched.
        if not args or not isinstance(args[0], dict):
            return super().TemplateResponse(*args, **kwargs)

        context = args[0]
        # Use an explicit "__template_name__" entry when present (it is removed
        # from the context); otherwise fall back to "index.html".
        template_name = context.pop("__template_name__", None) or "index.html"
        # Re-issue the call as (name, context, *remaining positional args).
        return super().TemplateResponse(template_name, context, *args[1:], **kwargs)


# Rebind the name on starlette.templating so later lookups of
# starlette.templating.Jinja2Templates resolve to the patched class.
_st.Jinja2Templates = _PatchedJinja2Templates

# Best effort: if gradio.routes already holds a templates instance, switch that
# instance over to the patched class. Any failure here is deliberately ignored.
try:
    import gradio.routes as _gr
    if getattr(_gr, 'templates', None) is not None:
        _gr.templates.__class__ = _PatchedJinja2Templates
except Exception:
    pass
| # ───────────────────────────────────────────────────────────────────────────── | |
| # MAIN APP | |
| # ───────────────────────────────────────────────────────────────────────────── | |
| import gradio as gr | |
| import re, os, tempfile | |
| from huggingface_hub import InferenceClient | |
# Token comes from the HF_TOKEN environment variable / Space secret ("" if unset).
HF_TOKEN = os.environ.get("HF_TOKEN", "")
MODEL_ID = "google/gemma-2-9b-it"
# Pass None rather than "" when no token is configured: an empty string would
# be forwarded as an empty bearer credential, while None lets huggingface_hub
# apply its own default token handling.
client = InferenceClient(model=MODEL_ID, token=HF_TOKEN or None)
| # ── SRT helpers ────────────────────────────────────────────────────────────── | |
def parse_srt(content: str) -> list:
    """Parse SRT text into a list of cue dictionaries.

    Cues are separated by blank lines. A chunk is accepted only when its first
    line is a decimal index and its second line contains ``-->``; anything
    else is skipped silently.

    Args:
        content: Raw SRT text. A leading byte-order mark and CRLF / bare-CR
            line endings are tolerated.

    Returns:
        A list of dicts with keys ``'number'`` (int), ``'timecode'`` (str)
        and ``'text'`` (str, remaining lines joined with ``'\\n'``).
    """
    # A pasted BOM would otherwise make the first index fail the digit check,
    # and bare-CR files would never match the "\n...\n" cue separator.
    normalized = content.lstrip('\ufeff').replace('\r\n', '\n').replace('\r', '\n')

    blocks = []
    for block in re.split(r'\n\s*\n', normalized.strip()):
        lines = block.strip().splitlines()
        if len(lines) < 2:
            continue
        index = lines[0].strip()
        timecode = lines[1].strip()
        # isdecimal() only accepts characters that int() can convert;
        # isdigit() also accepts e.g. superscript digits, which crash int().
        if not index.isdecimal() or '-->' not in timecode:
            continue
        blocks.append({
            'number': int(index),
            'timecode': timecode,
            'text': '\n'.join(lines[2:]).strip(),
        })
    return blocks
def blocks_to_srt(blocks: list) -> str:
    """Serialize cue dictionaries back to SRT text, renumbering from 1.

    Args:
        blocks: Cue dicts providing ``'timecode'`` and ``'text'`` keys.

    Returns:
        The SRT text terminated by a single newline, or ``''`` when there are
        no cues (so callers testing the result for truthiness see "no output"
        instead of a lone newline).
    """
    if not blocks:
        return ''
    return '\n\n'.join(
        f"{i}\n{b['timecode']}\n{b['text']}"
        for i, b in enumerate(blocks, 1)
    ) + '\n'
| # ── Detection ──────────────────────────────────────────────────────────────── | |
# Case-insensitive pattern for text that marks a cue as non-dialogue metadata:
# URLs, known subtitle-site names, credit phrases and copyright notices.
REMOVE_RE = re.compile(
    r'https?://\S+|www\.\S+\.\S+'
    r'|subscene|opensubtitles|addic7ed|podnapisi|subdownloader'
    r'|subtitles?\s+by\s+\w+|translated\s+by\s+\w+'
    r'|encoded\s+by|synced\s+by|ripped\s+by|corrected\s+by'
    # The copyright sign is written as an escape so it cannot be damaged by a
    # mis-decoded source file (the previous literal was a two-character
    # mis-encoding that never matched a plain copyright sign).
    r'|\bcopyright\b|\u00a9|all\s+rights\s+reserved'
    r'|downloaded\s+from|subtitle\s+group',
    re.IGNORECASE
)


def is_metadata(text: str) -> bool:
    """Return True when *text* contains any pattern from ``REMOVE_RE``."""
    return bool(REMOVE_RE.search(text))
def gemma_classify(text: str) -> str:
    """Ask the hosted model whether *text* is metadata or dialogue.

    Returns ``'REMOVE'`` for blank text or when the model's reply contains
    "REMOVE" (case-insensitive); otherwise ``'KEEP'``. Any exception raised
    while querying or reading the reply is printed and treated as ``'KEEP'``.
    """
    if text.strip() == "":
        return 'REMOVE'

    prompt = (
        "<start_of_turn>user\n"
        "Is this subtitle text metadata (credits/copyright/website/promo) or real dialogue?\n"
        "Reply ONLY 'REMOVE' or 'KEEP'.\n\n"
        f"Text:\n{text}\n"
        "<end_of_turn>\n<start_of_turn>model\n"
    )

    try:
        reply = client.text_generation(
            prompt,
            max_new_tokens=10,
            temperature=0.1,
            do_sample=False,
        )
        if 'REMOVE' in reply.strip().upper():
            return 'REMOVE'
        return 'KEEP'
    except Exception as e:
        # Best effort: a failed model call must never drop a subtitle line.
        print(f"[Gemma] {e}")
        return 'KEEP'
| # ── Core ───────────────────────────────────────────────────────────────────── | |
def process(content: str, use_gemma: bool):
    """Filter metadata cues out of SRT text.

    Each parsed cue is dropped when the regex detector flags it, or (only when
    *use_gemma* is true and the cue text is non-blank) when the model
    classifier answers ``'REMOVE'``. Every removal is logged with the cue's
    original number and the first 80 characters of its text.

    Returns:
        A ``(cleaned_srt, report)`` tuple of strings. When no cue can be
        parsed, ``cleaned_srt`` is ``""`` and the report is a warning message.
    """
    blocks = parse_srt(content)
    if len(blocks) == 0:
        return "", "β οΈ ΰΆΰ·ΰ·ΰ·ΰΆΈ subtitle block ΰΆΰΆΰΆΰ· ΰ·ΰΆΈΰ· ΰΆ±ΰ·ΰ·ΰ·ΰΆΊ."

    kept = []
    log = []
    for cue in blocks:
        body = cue['text']
        if is_metadata(body):
            log.append(f"#{cue['number']} [REGEX]: {body[:80]}")
        elif use_gemma and body.strip() and gemma_classify(body) == 'REMOVE':
            log.append(f"#{cue['number']} [GEMMA]: {body[:80]}")
        else:
            kept.append(cue)

    total_count = len(blocks)
    kept_count = len(kept)
    removed_count = total_count - kept_count
    report = [
        f"β ΰ·ΰΆΈΰ·ΰΆ΄ΰ·ΰΆ»ΰ·ΰΆ« blocks: {total_count}",
        f"ποΈ ΰΆΰ·ΰΆΰ· ΰΆΰ· blocks: {removed_count}",
        f"βοΈ ΰΆΰΆΰ·ΰΆ»ΰ· blocks: {kept_count}",
        "",
    ]
    if log:
        # Append one bullet per removed cue under a heading line.
        report.append("ΰΆΰ·ΰΆΰ· ΰΆΰ· lines:")
        report.extend(f" β’ {entry}" for entry in log)
    return blocks_to_srt(kept), '\n'.join(report)
def save_tmp(text: str) -> str:
    """Write *text* to a new UTF-8 temp file and return its path.

    The file name ends in ``_cleaned.srt`` and the file is not deleted on
    close, so the returned path stays valid for the caller to serve.
    """
    # The context manager guarantees the handle is closed even if write() fails.
    with tempfile.NamedTemporaryFile(mode='w', suffix='_cleaned.srt',
                                     encoding='utf-8', delete=False) as f:
        f.write(text)
    return f.name
| # ── Handlers ───────────────────────────────────────────────────────────────── | |
def handle_file(path, use_gemma):
    """Clean the SRT file at *path*.

    Returns a ``(cleaned_text, report, download_path)`` tuple. A falsy path or
    a failure while reading yields ``("", <message>, None)``; ``download_path``
    is also ``None`` whenever the cleaned text is empty.
    """
    if not path:
        return "", "ΰΆΰ·ΰΆ±ΰ·ΰ·ΰΆΰ· ΰΆΰ·ΰΆ»ΰΆ±ΰ·ΰΆ±.", None

    try:
        # utf-8-sig drops a leading BOM; undecodable bytes are replaced.
        with open(path, encoding='utf-8-sig', errors='replace') as srt_file:
            content = srt_file.read()
    except Exception as err:
        return "", f"β {err}", None

    cleaned, report = process(content, use_gemma)
    download_path = None
    if cleaned:
        download_path = save_tmp(cleaned)
    return cleaned, report, download_path
def handle_text(text, use_gemma):
    """Clean SRT content supplied directly as text.

    Returns a ``(cleaned_text, report, download_path)`` tuple. ``None``, empty
    or whitespace-only input yields ``("", <warning>, None)``;
    ``download_path`` is also ``None`` whenever the cleaned text is empty.
    """
    # Guard None as well as blank text, mirroring handle_file's falsy-path check.
    if not text or not text.strip():
        return "", "β οΈ SRT content ΰΆΰΆΰ·ΰ· ΰ· ΰΆΰΆ»ΰΆ±ΰ·ΰΆ±.", None
    cleaned, report = process(text, use_gemma)
    return cleaned, report, save_tmp(cleaned) if cleaned else None
| # ── UI ─────────────────────────────────────────────────────────────────────── | |
# Custom CSS: a monospace helper class for the SRT text boxes, and hide the footer.
css = ".mono{font-family:monospace;font-size:13px} footer{display:none!important}"
# Build the Blocks UI: three tabs (file upload, pasted text, API usage notes).
with gr.Blocks(title="π¬ Sinhala SRT Cleaner", theme=gr.themes.Soft(), css=css) as demo:
    gr.Markdown("# π¬ ΰ·ΰ·ΰΆΰ·ΰΆ½ SRT Subtitle Cleaner\n**Gemma 2 9B** ΰΆ·ΰ·ΰ·ΰ·ΰΆΰ· ΰΆΰΆ» author credits, website URLs, copyright notices ΰΆΰ·ΰΆΰ· ΰΆΰΆ»ΰΆΊΰ·.")
    with gr.Tabs():
        # Tab 1: upload an .srt file and process it with handle_file.
        with gr.TabItem("π ΰΆΰ·ΰΆ±ΰ· Upload"):
            with gr.Row():
                with gr.Column(scale=1):
                    file_in = gr.File(label="SRT ΰΆΰ·ΰΆ±ΰ·ΰ·", file_types=[".srt"], type="filepath")
                    gemma_cb1 = gr.Checkbox(label="π€ Gemma AI", value=True,
                                            info="Uncheck β Regex only (ΰΆΰΆΰ·ΰΆΈΰΆ±ΰ·)")
                    btn1 = gr.Button("π Process", variant="primary", size="lg")
                with gr.Column(scale=2):
                    report1 = gr.Textbox(label="π Report", lines=8, interactive=False)
                    preview1 = gr.Textbox(label="β Cleaned Preview", lines=15,
                                          interactive=False, elem_classes="mono")
                    dl1 = gr.File(label="β¬οΈ Download")
            # Output order follows handle_file's return tuple:
            # (cleaned text, report, download path).
            btn1.click(fn=handle_file, inputs=[file_in, gemma_cb1],
                       outputs=[preview1, report1, dl1])
        # Tab 2: paste SRT text and process it with handle_text.
        with gr.TabItem("π Text Paste"):
            with gr.Row():
                with gr.Column():
                    text_in = gr.Textbox(label="SRT Content", lines=15, elem_classes="mono",
                                         placeholder="1\n00:00:01,000 --> 00:00:04,000\nExample...")
                    gemma_cb2 = gr.Checkbox(label="π€ Gemma AI", value=True)
                    btn2 = gr.Button("π Process", variant="primary")
                with gr.Column():
                    report2 = gr.Textbox(label="π Report", lines=6, interactive=False)
                    preview2 = gr.Textbox(label="β Cleaned Output", lines=15,
                                          interactive=False, elem_classes="mono")
                    dl2 = gr.File(label="β¬οΈ Download")
            # Output order follows handle_text's return tuple:
            # (cleaned text, report, download path).
            btn2.click(fn=handle_text, inputs=[text_in, gemma_cb2],
                       outputs=[preview2, report2, dl2])
        # Tab 3: static usage examples rendered as Markdown.
        # NOTE(review): the cURL example hard-codes fn_index 1 and the
        # /run/predict route — confirm both against the deployed gradio version.
        with gr.TabItem("π API"):
            gr.Markdown("""
## API Endpoint
### Python β File
```python
from gradio_client import Client
client = Client("your-username/srt-cleaner")
cleaned, report, file = client.predict(
"path/to/file.srt", True, api_name="/handle_file"
)
```
### Python β Text
```python
cleaned, report, file = client.predict(
"1\\n00:00:01,000 --> 00:00:04,000\\nwww.subscene.com\\n\\n2\\n...",
True, api_name="/handle_text"
)
```
### cURL
```bash
curl -X POST "https://your-space.hf.space/run/predict" \\
-H "Content-Type: application/json" \\
-d '{"fn_index":1,"data":["SRT_CONTENT",true]}'
```
""")
    # Footer reminder that the HF_TOKEN secret must be configured for the Space.
    gr.Markdown("---\n**Setup:** Space Settings β Secrets β `HF_TOKEN` set ΰΆΰΆ»ΰΆ±ΰ·ΰΆ±.")
# Enable the request queue with at most 10 pending requests.
demo.queue(max_size=10)
# When executed as a script, serve on all interfaces at port 7860.
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)