Subtitiles / app.py
mrpoddaa's picture
Create app.py
de07519 verified
Raw
History Blame Contribute Delete
11.3 kB
# ═══════════════════════════════════════════════════════════════════════════════
# COMPATIBILITY PATCHES — must run before any other import
# ═══════════════════════════════════════════════════════════════════════════════
import sys, types
# 1) Python 3.13 removed audioop; pydub needs it via gradio 4.44.
#    Register empty placeholder modules so that `import audioop` succeeds, but
#    only when the real module cannot be found: on interpreters that still ship
#    audioop, an empty stub must not shadow the working implementation.
import importlib.util

for _m in ("audioop", "pyaudioop"):
    if _m in sys.modules:
        # Already imported (or already stubbed); leave it untouched.
        continue
    if importlib.util.find_spec(_m) is None:
        sys.modules[_m] = types.ModuleType(_m)
# 2) gradio_client schema patches (bool/non-dict schema guard)
import gradio_client.utils as _gcu
# Keep references to the unpatched implementations; the guarded wrappers
# defined below delegate to them whenever the schema is a dict. These must be
# captured before the module attributes are rebound.
_orig_get_type = _gcu.get_type
_orig_inner = _gcu._json_schema_to_python_type  # private helper of gradio_client.utils
_orig_j2p = _gcu.json_schema_to_python_type
def _safe_get_type(schema):
if not isinstance(schema, dict): return {}
return _orig_get_type(schema)
def _safe_inner(schema, defs=None):
if not isinstance(schema, dict): return "Any"
return _orig_inner(schema, defs)
def _safe_j2p(schema):
if not isinstance(schema, dict): return "Any"
return _orig_j2p(schema)
# Install the guarded wrappers in place of the library functions. The module
# attributes are rebound, so lookups made through gradio_client.utils after
# this point resolve to the wrappers.
_gcu.get_type = _safe_get_type
_gcu._json_schema_to_python_type = _safe_inner
_gcu.json_schema_to_python_type = _safe_j2p
# 3) FIX: gradio 4.44.0 calls templates.TemplateResponse(dict, dict) — it passes
# the context dict as the first arg (template name) instead of a string.
# Starlette's TemplateResponse signature is (name: str, context: dict, ...).
# We patch TemplateResponse to detect the swapped args and correct them.
# NOTE(review): the call shape described above is not visible in this file —
# confirm it against the installed gradio and starlette versions.
import starlette.templating as _st
# Remember the unpatched class; the patched subclass below derives from it.
_OrigJinja2Templates = _st.Jinja2Templates
class _PatchedJinja2Templates(_OrigJinja2Templates):
    """``Jinja2Templates`` variant that tolerates a context dict as first positional argument."""

    def TemplateResponse(self, *args, **kwargs):
        """Forward to the parent ``TemplateResponse`` after normalising ``args``.

        When the first positional argument is a dict it is treated as the
        template context: the template name is taken from its
        ``"__template_name__"`` entry (which is removed from the dict) and
        falls back to ``"index.html"``. The call is then forwarded as
        ``(name, context, *remaining_args)``. Every other call is forwarded
        unchanged.
        """
        leading_is_context = bool(args) and isinstance(args[0], dict)
        if not leading_is_context:
            return super().TemplateResponse(*args, **kwargs)

        context, *remaining = args
        # NOTE(review): "index.html" is assumed to be the template gradio
        # renders when no explicit name is supplied — confirm against gradio.
        template_name = context.pop("__template_name__", None) or "index.html"
        return super().TemplateResponse(template_name, context, *remaining, **kwargs)
# Publish the patched class on starlette.templating so that code which looks
# up Jinja2Templates on that module from now on receives the patched version.
# Names that were already bound via `from starlette.templating import ...`
# are not affected by this rebinding.
_st.Jinja2Templates = _PatchedJinja2Templates
# gradio.routes may hold a module-level `templates` instance that was created
# from the unpatched class; retarget that existing instance to the patched one.
try:
    import gradio.routes as _gr
    if getattr(_gr, "templates", None) is not None:
        _gr.templates.__class__ = _PatchedJinja2Templates
except Exception as _patch_err:
    # Best effort: start-up continues without this patch, but report the
    # failure instead of hiding it so a broken patch can be diagnosed from logs.
    print(f"[compat] gradio.routes template patch skipped: {_patch_err!r}")
# ═══════════════════════════════════════════════════════════════════════════════
# MAIN APP
# ═══════════════════════════════════════════════════════════════════════════════
import gradio as gr
import re, os, tempfile
from huggingface_hub import InferenceClient
# Access token for the Hugging Face Inference API, read from the environment
# (empty string when the variable is not set).
HF_TOKEN = os.environ.get("HF_TOKEN", "")
# Model queried for the optional AI classification pass.
MODEL_ID = "google/gemma-2-9b-it"
# Pass None rather than "" when no token is configured, so that an empty
# credential is never handed to the client.
client = InferenceClient(model=MODEL_ID, token=HF_TOKEN or None)
# ── SRT helpers ────────────────────────────────────────────────────────────────
def parse_srt(content: str) -> list:
    """Parse SRT text into a list of subtitle blocks.

    The text is split on blank lines. A chunk is accepted when its first line
    is a decimal cue index and its second line contains ``-->``; every other
    chunk is skipped.

    Args:
        content: Raw SRT text. A leading byte-order mark is ignored.

    Returns:
        A list of dicts with the keys ``'number'`` (int cue index),
        ``'timecode'`` (the stripped timing line) and ``'text'`` (the remaining
        lines joined with newlines, stripped). Empty when nothing parses.
    """
    blocks = []
    # str.strip() does not remove a BOM; left in place it would make the first
    # cue index fail the numeric check and silently drop that cue.
    cleaned = content.strip().lstrip('\ufeff')
    for block in re.split(r'\n\s*\n', cleaned):
        lines = block.strip().splitlines()
        if len(lines) < 2:
            continue
        index = lines[0].strip()
        # isdecimal() rather than isdigit(): isdigit() also accepts characters
        # such as a superscript two, which int() rejects with ValueError.
        if not index.isdecimal() or '-->' not in lines[1]:
            continue
        blocks.append({
            'number': int(index),
            'timecode': lines[1].strip(),
            'text': '\n'.join(lines[2:]).strip(),
        })
    return blocks
def blocks_to_srt(blocks: list) -> str:
    """Serialise subtitle blocks to SRT text, renumbering the cues from 1.

    Each block contributes its position, its ``'timecode'`` and its ``'text'``
    on consecutive lines; cues are separated by one blank line and the result
    always ends with a newline (an empty list yields just ``"\\n"``).
    """
    cues = [
        f"{position}\n{entry['timecode']}\n{entry['text']}"
        for position, entry in enumerate(blocks, start=1)
    ]
    return '\n\n'.join(cues) + '\n'
# ── Detection ──────────────────────────────────────────────────────────────────
# Case-insensitive pattern for text that marks a subtitle block as metadata:
# URLs, names of subtitle sites, credit phrases and copyright notices.
REMOVE_RE = re.compile(
    r'https?://\S+|www\.\S+\.\S+'
    r'|subscene|opensubtitles|addic7ed|podnapisi|subdownloader'
    r'|subtitles?\s+by\s+\w+|translated\s+by\s+\w+'
    r'|encoded\s+by|synced\s+by|ripped\s+by|corrected\s+by'
    # \u00a9 is the copyright sign, written as an escape so that the pattern
    # cannot be damaged by a wrong file encoding.
    r'|\bcopyright\b|\u00a9|all\s+rights\s+reserved'
    r'|downloaded\s+from|subtitle\s+group',
    re.IGNORECASE
)
def is_metadata(text: str) -> bool:
    """Return True when ``REMOVE_RE`` matches anywhere in ``text``."""
    return REMOVE_RE.search(text) is not None
def gemma_classify(text: str) -> str:
    """Ask the hosted model whether ``text`` is metadata or dialogue.

    Blank text is answered with ``'REMOVE'`` without calling the model.
    Otherwise the reply is ``'REMOVE'`` when the model output contains that
    word (case-insensitive) and ``'KEEP'`` in every other case, including any
    error raised while querying the model (the error is printed).
    """
    if text.strip() == "":
        return 'REMOVE'

    prompt = (
        "<start_of_turn>user\n"
        "Is this subtitle text metadata (credits/copyright/website/promo) or real dialogue?\n"
        "Reply ONLY 'REMOVE' or 'KEEP'.\n\n"
        f"Text:\n{text}\n"
        "<end_of_turn>\n<start_of_turn>model\n"
    )
    try:
        reply = client.text_generation(
            prompt, max_new_tokens=10, temperature=0.1, do_sample=False
        )
        verdict = 'REMOVE' if 'REMOVE' in reply.strip().upper() else 'KEEP'
    except Exception as err:
        # Keep the block when the model cannot be consulted.
        print(f"[Gemma] {err}")
        verdict = 'KEEP'
    return verdict
# ── Core ───────────────────────────────────────────────────────────────────────
def process(content: str, use_gemma: bool):
    """Remove metadata blocks from SRT text and build a summary report.

    Every parsed block is checked with ``is_metadata`` first. When
    ``use_gemma`` is true, blocks that pass the regex check and have
    non-blank text are additionally classified with ``gemma_classify``.

    Args:
        content: Raw SRT text.
        use_gemma: Whether to run the model-based check after the regex check.

    Returns:
        A ``(cleaned_srt, report)`` tuple of strings. ``cleaned_srt`` is ``""``
        when no subtitle block could be parsed.
    """
    blocks = parse_srt(content)
    if not blocks:
        # Sinhala: "No subtitle block was found."
        return "", "⚠️ කිසිම subtitle block එකක් හමු නොවීය."

    kept, log = [], []
    for b in blocks:
        t = b['text']
        if is_metadata(t):
            log.append(f"#{b['number']} [REGEX]: {t[:80]}")
            continue
        # The model is only consulted for non-blank text that passed the regex.
        if use_gemma and t.strip() and gemma_classify(t) == 'REMOVE':
            log.append(f"#{b['number']} [GEMMA]: {t[:80]}")
            continue
        kept.append(b)

    # Sinhala labels: total blocks / removed blocks / remaining blocks.
    report = [
        f"✅ සම්පූර්ණ blocks: {len(blocks)}",
        f"🗑️ ඉවත් කළ blocks: {len(blocks) - len(kept)}",
        f"✔️ ඉතිරි blocks: {len(kept)}",
        "",
    ]
    if log:
        # Sinhala heading: "Removed lines:".
        report += ["ඉවත් කළ lines:"] + [f" • {entry}" for entry in log]
    return blocks_to_srt(kept), '\n'.join(report)
def save_tmp(text: str) -> str:
    """Write ``text`` to a new UTF-8 temporary file and return its path.

    The file name ends in ``_cleaned.srt``. The file is created with
    ``delete=False``, so it remains on disk after it is closed.
    """
    # The context manager closes the handle even when write() raises.
    with tempfile.NamedTemporaryFile(mode='w', suffix='_cleaned.srt',
                                     encoding='utf-8', delete=False) as f:
        f.write(text)
    return f.name
# ── Handlers ───────────────────────────────────────────────────────────────────
def handle_file(path, use_gemma):
    """Clean an uploaded SRT file.

    Args:
        path: Path of the uploaded file; a falsy value means nothing was uploaded.
        use_gemma: Whether to run the model-based check in addition to the regex.

    Returns:
        A ``(cleaned_srt, report, download_path)`` tuple. ``download_path`` is
        ``None`` when there is no cleaned output; on a missing upload or a read
        error ``cleaned_srt`` is ``""`` and ``report`` carries the message.
    """
    if not path:
        # Sinhala: "Select a file."
        return "", "ගොනුවක් තෝරන්න.", None
    try:
        # utf-8-sig drops a leading BOM; undecodable bytes are replaced
        # instead of raising.
        with open(path, encoding='utf-8-sig', errors='replace') as f:
            content = f.read()
    except Exception as e:
        # Any failure to read the upload is reported to the user.
        return "", f"❌ {e}", None
    cleaned, report = process(content, use_gemma)
    download_path = save_tmp(cleaned) if cleaned else None
    return cleaned, report, download_path
def handle_text(text, use_gemma):
    """Clean SRT content that was pasted as text.

    Args:
        text: Raw SRT text; ``None``, empty or blank input is rejected with a message.
        use_gemma: Whether to run the model-based check in addition to the regex.

    Returns:
        A ``(cleaned_srt, report, download_path)`` tuple. ``download_path`` is
        ``None`` when there is no cleaned output.
    """
    # `not text` also covers None, on which .strip() would raise.
    if not text or not text.strip():
        # Sinhala: "Enter SRT content."
        return "", "⚠️ SRT content ඇතුළු කරන්න.", None
    cleaned, report = process(text, use_gemma)
    download_path = save_tmp(cleaned) if cleaned else None
    return cleaned, report, download_path
# ── UI ─────────────────────────────────────────────────────────────────────────
# Monospace style for the SRT text boxes; the page footer is hidden.
css = ".mono{font-family:monospace;font-size:13px} footer{display:none!important}"

# NOTE(review): the nesting below is reconstructed from the order of the
# components — confirm the layout against the running app.
with gr.Blocks(title="🎬 Sinhala SRT Cleaner", theme=gr.themes.Soft(), css=css) as demo:
    gr.Markdown("# 🎬 සිංහල SRT Subtitle Cleaner\n**Gemma 2 9B** භාවිතා කර author credits, website URLs, copyright notices ඉවත් කරයි.")
    with gr.Tabs():
        # Tab 1: upload an .srt file; handled by handle_file.
        with gr.TabItem("📁 ගොනු Upload"):
            with gr.Row():
                with gr.Column(scale=1):
                    file_in = gr.File(label="SRT ගොනුව", file_types=[".srt"], type="filepath")
                    gemma_cb1 = gr.Checkbox(label="🤖 Gemma AI", value=True,
                                            info="Uncheck → Regex only (ඉක්මන්)")
                    btn1 = gr.Button("🚀 Process", variant="primary", size="lg")
                with gr.Column(scale=2):
                    report1 = gr.Textbox(label="📊 Report", lines=8, interactive=False)
                    preview1 = gr.Textbox(label="✅ Cleaned Preview", lines=15,
                                          interactive=False, elem_classes="mono")
                    dl1 = gr.File(label="⬇️ Download")
            # Output order matches handle_file's (cleaned, report, path) tuple.
            btn1.click(fn=handle_file, inputs=[file_in, gemma_cb1],
                       outputs=[preview1, report1, dl1])
        # Tab 2: paste SRT text; handled by handle_text.
        with gr.TabItem("📝 Text Paste"):
            with gr.Row():
                with gr.Column():
                    text_in = gr.Textbox(label="SRT Content", lines=15, elem_classes="mono",
                                         placeholder="1\n00:00:01,000 --> 00:00:04,000\nExample...")
                    gemma_cb2 = gr.Checkbox(label="🤖 Gemma AI", value=True)
                    btn2 = gr.Button("🚀 Process", variant="primary")
                with gr.Column():
                    report2 = gr.Textbox(label="📊 Report", lines=6, interactive=False)
                    preview2 = gr.Textbox(label="✅ Cleaned Output", lines=15,
                                          interactive=False, elem_classes="mono")
                    dl2 = gr.File(label="⬇️ Download")
            # Output order matches handle_text's (cleaned, report, path) tuple.
            btn2.click(fn=handle_text, inputs=[text_in, gemma_cb2],
                       outputs=[preview2, report2, dl2])
        # Tab 3: static usage notes. The Markdown text starts at column 0 on
        # purpose, so that it is not rendered as an indented code block.
        with gr.TabItem("🔌 API"):
            gr.Markdown("""
## API Endpoint
### Python — File
```python
from gradio_client import Client
client = Client("your-username/srt-cleaner")
cleaned, report, file = client.predict(
"path/to/file.srt", True, api_name="/handle_file"
)
```
### Python — Text
```python
cleaned, report, file = client.predict(
"1\\n00:00:01,000 --> 00:00:04,000\\nwww.subscene.com\\n\\n2\\n...",
True, api_name="/handle_text"
)
```
### cURL
```bash
curl -X POST "https://your-space.hf.space/run/predict" \\
-H "Content-Type: application/json" \\
-d '{"fn_index":1,"data":["SRT_CONTENT",true]}'
```
""")
    gr.Markdown("---\n**Setup:** Space Settings → Secrets → `HF_TOKEN` set කරන්න.")
# Enable request queuing with max_size=10.
demo.queue(max_size=10)

if __name__ == "__main__":
    # Only start the server when the file is executed as a script.
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
    )