Spaces:

mrpoddaa
/

Subtitiles

No application file

App Files Files Community

Subtitiles / app.py

mrpoddaa

Create app.py

de07519 verified about 1 month ago

Raw

History Blame Contribute Delete

11.3 kB

	# ═══════════════════════════════════════════════════════════════════════════════
	# COMPATIBILITY PATCHES — must run before any other import
	# ═══════════════════════════════════════════════════════════════════════════════

	import sys, types

	# 1) Python 3.13 removed audioop; pydub (reached through gradio 4.44) still
	#    imports it. Register empty placeholder modules under both names so those
	#    imports succeed; a module that is already registered is left untouched.
	for _stub_name in ("audioop", "pyaudioop"):
	    sys.modules.setdefault(_stub_name, types.ModuleType(_stub_name))

	# 2) gradio_client schema patches (bool/non-dict schema guard)
	import gradio_client.utils as _gcu

	# Keep handles to the stock implementations so the guards below can delegate.
	_orig_get_type = _gcu.get_type
	_orig_inner = _gcu._json_schema_to_python_type
	_orig_j2p = _gcu.json_schema_to_python_type


	def _safe_get_type(schema):
	    """Return ``{}`` for a non-dict schema, else defer to the original ``get_type``."""
	    return _orig_get_type(schema) if isinstance(schema, dict) else {}


	def _safe_inner(schema, defs=None):
	    """Return ``"Any"`` for a non-dict schema, else defer to the original helper."""
	    return _orig_inner(schema, defs) if isinstance(schema, dict) else "Any"


	def _safe_j2p(schema):
	    """Return ``"Any"`` for a non-dict schema, else defer to the original helper."""
	    return _orig_j2p(schema) if isinstance(schema, dict) else "Any"


	# Install the guarded versions on the gradio_client.utils module.
	for _attr_name, _guarded in (
	    ("get_type", _safe_get_type),
	    ("_json_schema_to_python_type", _safe_inner),
	    ("json_schema_to_python_type", _safe_j2p),
	):
	    setattr(_gcu, _attr_name, _guarded)

	# 3) FIX: gradio 4.44.0 calls templates.TemplateResponse(dict, dict) — passes
	# the context dict as the first arg (template name) instead of a string.
	# Starlette's TemplateResponse signature is (name: str, context: dict, ...).
	# We patch TemplateResponse to detect the swapped args and correct them.
	import starlette.templating as _st

	_OrigJinja2Templates = _st.Jinja2Templates


	class _PatchedJinja2Templates(_OrigJinja2Templates):
	    """Jinja2Templates that tolerates a context dict passed as the first argument.

	    When the first positional argument is a dict, a template name is
	    inserted in front of it before delegating to the parent class.
	    """

	    def TemplateResponse(self, *args, **kwargs):
	        """Forward to the parent ``TemplateResponse``, fixing swapped arguments.

	        Args:
	            *args: Positional arguments as supplied by the caller. If the
	                first one is a dict it is treated as the template context.
	            **kwargs: Keyword arguments, forwarded unchanged.

	        Returns:
	            Whatever the parent ``TemplateResponse`` returns.
	        """
	        if args and isinstance(args[0], dict):
	            ctx = args[0]
	            # Use an explicit "__template_name__" entry when present (it is
	            # removed from the context); otherwise fall back to "index.html".
	            # NOTE(review): the fallback assumes gradio's main template is
	            # "index.html" — confirm against the gradio version in use.
	            name = ctx.pop("__template_name__", None) or "index.html"
	            # Rebuild the positional arguments as (name, context, *rest).
	            args = (name, ctx) + args[1:]
	        return super().TemplateResponse(*args, **kwargs)


	# Rebind the module attribute so later lookups of
	# ``starlette.templating.Jinja2Templates`` resolve to the patched class.
	# Names already bound elsewhere via ``from ... import`` are not affected.
	_st.Jinja2Templates = _PatchedJinja2Templates

	# Also retarget an existing ``gradio.routes.templates`` object, if there is
	# one, by swapping its class to the patched subclass.
	try:
	    import gradio.routes as _gr

	    _routes_templates = getattr(_gr, 'templates', None)
	    if _routes_templates is not None:
	        _routes_templates.__class__ = _PatchedJinja2Templates
	except Exception:
	    # Best effort only: any failure here is ignored.
	    pass

	# ═══════════════════════════════════════════════════════════════════════════════
	# MAIN APP
	# ═══════════════════════════════════════════════════════════════════════════════

	import gradio as gr
	import re, os, tempfile
	from huggingface_hub import InferenceClient

	# Access token read from the environment; empty string when HF_TOKEN is unset.
	HF_TOKEN = os.environ.get("HF_TOKEN", "")
	# Model used for the optional AI classification pass.
	MODEL_ID = "google/gemma-2-9b-it"
	# Pass None instead of "" when no token is configured, so the client uses its
	# default token resolution rather than an empty credential.
	client = InferenceClient(model=MODEL_ID, token=HF_TOKEN or None)

	# ── SRT helpers ────────────────────────────────────────────────────────────────

	def parse_srt(content: str) -> list:
	    """Parse SRT text into a list of cue dicts.

	    Cues are separated by blank lines. A cue is accepted only when its first
	    line is a decimal number and its second line contains ``-->``; anything
	    else is skipped silently.

	    Args:
	        content: Raw SRT text. A leading byte-order mark is tolerated.

	    Returns:
	        A list of dicts with keys ``'number'`` (int), ``'timecode'`` (str)
	        and ``'text'`` (str, possibly multi-line or empty).
	    """
	    cues = []
	    # A BOM is not whitespace, so strip() alone would leave it glued to the
	    # first cue number and that cue would be rejected.
	    normalized = content.strip().lstrip('\ufeff')
	    for chunk in re.split(r'\n\s*\n', normalized):
	        lines = chunk.strip().splitlines()
	        if len(lines) < 2:
	            continue
	        number = lines[0].strip()
	        timecode = lines[1].strip()
	        # isdecimal() rather than isdigit(): isdigit() accepts characters such
	        # as '²' that int() cannot convert.
	        if not number.isdecimal() or '-->' not in timecode:
	            continue
	        cues.append({
	            'number': int(number),
	            'timecode': timecode,
	            'text': '\n'.join(lines[2:]).strip(),
	        })
	    return cues

	def blocks_to_srt(blocks: list) -> str:
	    """Serialize cue dicts to SRT text.

	    Cues are renumbered sequentially from 1 (their stored ``'number'`` is
	    ignored), separated by a blank line, and the result ends with a newline.
	    """
	    rendered = []
	    for position, cue in enumerate(blocks, start=1):
	        rendered.append(f"{position}\n{cue['timecode']}\n{cue['text']}")
	    body = '\n\n'.join(rendered)
	    return body + '\n'

	# ── Detection ──────────────────────────────────────────────────────────────────

	# Case-insensitive pattern for subtitle text that is credit or promo metadata
	# rather than dialogue: URLs, known subtitle-site names, "subtitles by ..." /
	# "translated by ..." credits, ripping/syncing credits, copyright notices and
	# download notices.
	# The alternatives must be separated by a bare `|`. An escaped `\|` matches a
	# literal pipe character and would collapse the pattern into one long literal.
	REMOVE_RE = re.compile(
	    r'https?://\S+|www\.\S+\.\S+'
	    r'|subscene|opensubtitles|addic7ed|podnapisi|subdownloader'
	    r'|subtitles?\s+by\s+\w+|translated\s+by\s+\w+'
	    r'|encoded\s+by|synced\s+by|ripped\s+by|corrected\s+by'
	    r'|\bcopyright\b|©|all\s+rights\s+reserved'
	    r'|downloaded\s+from|subtitle\s+group',
	    re.IGNORECASE,
	)

	def is_metadata(text: str) -> bool:
	    """Return True when REMOVE_RE matches anywhere in *text*."""
	    hit = REMOVE_RE.search(text)
	    return hit is not None

	def gemma_classify(text: str) -> str:
	    """Ask the model whether *text* is metadata or dialogue.

	    Returns:
	        ``'REMOVE'`` for blank text or when the model reply contains
	        "REMOVE" (case-insensitive); ``'KEEP'`` otherwise. Any exception
	        during the call is printed and treated as ``'KEEP'``.
	    """
	    if not text.strip():
	        return 'REMOVE'

	    # Prompt in Gemma's turn format: one user turn, then the open model turn.
	    prompt_head = (
	        "<start_of_turn>user\n"
	        "Is this subtitle text metadata (credits/copyright/website/promo) or real dialogue?\n"
	        "Reply ONLY 'REMOVE' or 'KEEP'.\n\nText:\n"
	    )
	    prompt_tail = "\n<end_of_turn>\n<start_of_turn>model\n"
	    prompt = ''.join((prompt_head, text, prompt_tail))

	    try:
	        reply = client.text_generation(
	            prompt,
	            max_new_tokens=10,
	            temperature=0.1,
	            do_sample=False,
	        )
	        verdict = 'REMOVE' if 'REMOVE' in reply.strip().upper() else 'KEEP'
	    except Exception as err:
	        # Fail open: a failed classification must not delete dialogue.
	        print(f"[Gemma] {err}")
	        verdict = 'KEEP'
	    return verdict

	# ── Core ───────────────────────────────────────────────────────────────────────

	def process(content: str, use_gemma: bool):
	    """Remove metadata cues from SRT text and build a summary report.

	    Each cue is checked with the regex filter first; when *use_gemma* is true
	    and the cue text is non-blank, cues that pass the regex are also sent to
	    the model classifier.

	    Args:
	        content: Raw SRT text.
	        use_gemma: Whether to run the model classifier as well.

	    Returns:
	        A ``(cleaned_srt, report)`` tuple of strings. ``cleaned_srt`` is
	        ``""`` when no cue could be parsed.
	    """
	    blocks = parse_srt(content)
	    if not blocks:
	        return "", "⚠️ කිසිම subtitle block එකක් හමු නොවීය."

	    kept = []
	    removal_log = []
	    for cue in blocks:
	        text = cue['text']
	        if is_metadata(text):
	            reason = 'REGEX'
	        elif use_gemma and text.strip() and gemma_classify(text) == 'REMOVE':
	            reason = 'GEMMA'
	        else:
	            kept.append(cue)
	            continue
	        # Log the original cue number and at most 80 characters of its text.
	        removal_log.append(f"#{cue['number']} [{reason}]: {text[:80]}")

	    removed_count = len(blocks) - len(kept)
	    report_lines = [
	        f"✅ සම්පූර්ණ blocks: {len(blocks)}",
	        f"🗑️ ඉවත් කළ blocks: {removed_count}",
	        f"✔️ ඉතිරි blocks: {len(kept)}",
	        "",
	    ]
	    if removal_log:
	        report_lines.append("ඉවත් කළ lines:")
	        report_lines.extend(f" • {entry}" for entry in removal_log)

	    return blocks_to_srt(kept), '\n'.join(report_lines)

	def save_tmp(text: str) -> str:
	    """Write *text* to a new UTF-8 temporary file and return its path.

	    The file name ends in ``_cleaned.srt`` and the file is not deleted on
	    close, so the caller can hand the path on for download.
	    """
	    # The with-block closes the handle even if write() raises.
	    with tempfile.NamedTemporaryFile(
	        mode='w',
	        suffix='_cleaned.srt',
	        encoding='utf-8',
	        delete=False,
	    ) as handle:
	        handle.write(text)
	    return handle.name

	# ── Handlers ───────────────────────────────────────────────────────────────────

	def handle_file(path, use_gemma):
	    """Clean an uploaded SRT file.

	    Args:
	        path: Path of the uploaded file; falsy when nothing was selected.
	        use_gemma: Whether to run the model classifier as well as the regex.

	    Returns:
	        A ``(cleaned_text, report, download_path)`` tuple. ``download_path``
	        is None when there is no cleaned output or the file could not be read.
	    """
	    if not path:
	        return "", "ගොනුවක් තෝරන්න.", None

	    try:
	        # utf-8-sig drops a leading BOM; undecodable bytes are replaced.
	        with open(path, encoding='utf-8-sig', errors='replace') as source:
	            content = source.read()
	    except Exception as err:
	        return "", f"❌ {err}", None

	    cleaned, report = process(content, use_gemma)
	    download_path = None
	    if cleaned:
	        download_path = save_tmp(cleaned)
	    return cleaned, report, download_path

	def handle_text(text, use_gemma):
	    """Clean SRT content pasted as text.

	    Args:
	        text: Raw SRT text. None, empty and whitespace-only values are all
	            treated as missing input.
	        use_gemma: Whether to run the model classifier as well as the regex.

	    Returns:
	        A ``(cleaned_text, report, download_path)`` tuple. ``download_path``
	        is None when there is no cleaned output.
	    """
	    # Guard against None as well as blank strings so a missing value does
	    # not raise AttributeError on .strip().
	    if not text or not text.strip():
	        return "", "⚠️ SRT content ඇතුළු කරන්න.", None
	    cleaned, report = process(text, use_gemma)
	    return cleaned, report, save_tmp(cleaned) if cleaned else None

	# ── UI ─────────────────────────────────────────────────────────────────────────

	# Custom CSS: monospace class for SRT text boxes, and hide the page footer.
	css = ".mono{font-family:monospace;font-size:13px} footer{display:none!important}"

	# Markdown shown in the "API" tab: examples for calling the app remotely.
	_API_DOCS_MD = """
	## API Endpoint
	### Python — File
	```python
	from gradio_client import Client
	client = Client("your-username/srt-cleaner")
	cleaned, report, file = client.predict(
	"path/to/file.srt", True, api_name="/handle_file"
	)
	```
	### Python — Text
	```python
	cleaned, report, file = client.predict(
	"1\\n00:00:01,000 --> 00:00:04,000\\nwww.subscene.com\\n\\n2\\n...",
	True, api_name="/handle_text"
	)
	```
	### cURL
	```bash
	curl -X POST "https://your-space.hf.space/run/predict" \\
	-H "Content-Type: application/json" \\
	-d '{"fn_index":1,"data":["SRT_CONTENT",true]}'
	```
	"""

	# Three tabs: file upload, text paste, and API usage notes. Both Process
	# buttons feed (cleaned text, report, download path) into their outputs.
	with gr.Blocks(title="🎬 Sinhala SRT Cleaner", theme=gr.themes.Soft(), css=css) as demo:

	    gr.Markdown("# 🎬 සිංහල SRT Subtitle Cleaner\nGemma 2 9B භාවිතා කර author credits, website URLs, copyright notices ඉවත් කරයි.")

	    with gr.Tabs():

	        # Tab 1: upload an .srt file.
	        with gr.TabItem("📁 ගොනු Upload"):
	            with gr.Row():
	                with gr.Column(scale=1):
	                    file_in = gr.File(
	                        label="SRT ගොනුව",
	                        file_types=[".srt"],
	                        type="filepath",
	                    )
	                    gemma_cb1 = gr.Checkbox(
	                        label="🤖 Gemma AI",
	                        value=True,
	                        info="Uncheck → Regex only (ඉක්මන්)",
	                    )
	                    btn1 = gr.Button("🚀 Process", variant="primary", size="lg")
	                with gr.Column(scale=2):
	                    report1 = gr.Textbox(label="📊 Report", lines=8, interactive=False)
	                    preview1 = gr.Textbox(
	                        label="✅ Cleaned Preview",
	                        lines=15,
	                        interactive=False,
	                        elem_classes="mono",
	                    )
	                    dl1 = gr.File(label="⬇️ Download")
	            btn1.click(
	                fn=handle_file,
	                inputs=[file_in, gemma_cb1],
	                outputs=[preview1, report1, dl1],
	            )

	        # Tab 2: paste SRT content directly.
	        with gr.TabItem("📝 Text Paste"):
	            with gr.Row():
	                with gr.Column():
	                    text_in = gr.Textbox(
	                        label="SRT Content",
	                        lines=15,
	                        elem_classes="mono",
	                        placeholder="1\n00:00:01,000 --> 00:00:04,000\nExample...",
	                    )
	                    gemma_cb2 = gr.Checkbox(label="🤖 Gemma AI", value=True)
	                    btn2 = gr.Button("🚀 Process", variant="primary")
	                with gr.Column():
	                    report2 = gr.Textbox(label="📊 Report", lines=6, interactive=False)
	                    preview2 = gr.Textbox(
	                        label="✅ Cleaned Output",
	                        lines=15,
	                        interactive=False,
	                        elem_classes="mono",
	                    )
	                    dl2 = gr.File(label="⬇️ Download")
	            btn2.click(
	                fn=handle_text,
	                inputs=[text_in, gemma_cb2],
	                outputs=[preview2, report2, dl2],
	            )

	        # Tab 3: static API usage notes.
	        with gr.TabItem("🔌 API"):
	            gr.Markdown(_API_DOCS_MD)

	    gr.Markdown("---\nSetup: Space Settings → Secrets → `HF_TOKEN` set කරන්න.")

	# Enable the request queue, capped at 10 entries.
	demo.queue(max_size=10)

	if __name__ == "__main__":
	    # When run as a script, serve on all interfaces at port 7860.
	    demo.launch(server_port=7860, server_name="0.0.0.0")