Spaces:

NIKKI77
/

ks-version-1.0

Paused

App Files Files Community

ks-version-1.0 / app.py

NIKKI77

Subtitle KIS v1.0 – initial

36b0811 8 months ago

raw

history blame contribute delete

5.02 kB

	from flask import Flask, render_template, request
	import os, html, pandas as pd, pickle, re
	from sklearn.metrics.pairwise import cosine_similarity
	import numpy as np

	app = Flask(__name__)

	# Health check (for HF Spaces readiness)
	@app.get("/health")
	def health():
	return {"ok": True}, 200

	# Path

	BASE_DIR = os.path.dirname(os.path.abspath(__file__))
	VECT_PATH = os.path.join(BASE_DIR, 'tfidf_vectorizer.pkl')
	MATR_PATH = os.path.join(BASE_DIR, 'tfidf_matrix.pkl')
	META_PATH = os.path.join(BASE_DIR, 'indexed_metadata.csv')

	# Load artifacts

	with open(VECT_PATH, 'rb') as f:
	vectorizer = pickle.load(f)
	with open(MATR_PATH, 'rb') as f:
	tfidf_matrix = pickle.load(f)
	metadata = pd.read_csv(META_PATH)

	# columns available (supports transition)
	CLEAN_COL = 'Subtitle Text Clean' if 'Subtitle Text Clean' in metadata.columns else 'Subtitle Text'
	RAW_COL = 'Subtitle Text Raw' if 'Subtitle Text Raw' in metadata.columns else CLEAN_COL

	# Utils
	_space_re = re.compile(r'\s+')

	def normalize_minimal(text: str) -> str:
	"""Lowercase + collapse spaces. No stopword removal. No punctuation strip."""
	if not isinstance(text, str):
	return ''
	t = text.lower()
	t = _space_re.sub(' ', t).strip()
	return t

	def highlight_exact_phrase(text: str, phrase: str) -> str:
	"""
	Escape HTML then highlight the exact query as a whole word/phrase (case-insensitive).
	Prevents 'ai' from highlighting inside 'domain'.
	"""
	safe = html.escape(text or '')
	p = (phrase or '').strip()
	if not p:
	return safe

	if ' ' not in p:
	pattern = re.compile(rf'(?<![A-Za-z0-9_]){re.escape(p)}(?![A-Za-z0-9_])', re.IGNORECASE)
	else:
	parts = [re.escape(w) for w in re.split(r'\s+', p)]
	inner = r'\s+'.join(parts)
	pattern = re.compile(rf'(?<![A-Za-z0-9_]){inner}(?![A-Za-z0-9_])', re.IGNORECASE)

	return pattern.sub(r'<mark>\g<0></mark>', safe)

	def timestamp_to_seconds(timestamp):
	try:
	h, m, s = timestamp.split(':')
	return int(h) * 3600 + int(m) * 60 + float(s)
	except Exception:
	return 0

	# Core search
	def search_subtitles(query):
	"""
	Returns (results_df, total_hits)
	- Results are ALL positive-similarity matches, sorted by relevance (desc).
	- Context uses RAW text; highlight is exact-phrase on the match line only.
	"""
	raw_query = (query or '').strip()
	q_norm = normalize_minimal(raw_query)
	if not q_norm:
	return pd.DataFrame(), 0

	sims = cosine_similarity(vectorizer.transform([q_norm]), tfidf_matrix).flatten()
	order = np.argsort(sims)[::-1]

	# Keep only positive-similarity hits
	eps = 1e-12
	pos_order = order[sims[order] > eps]
	total_hits = int(pos_order.size)
	if total_hits == 0:
	return pd.DataFrame(), 0

	# Build all results
	results = metadata.iloc[pos_order].copy()

	links, thumbs, titles, times, contexts, scores = [], [], [], [], [], []
	for row_idx in pos_order:
	row = metadata.iloc[row_idx]
	vid = row.get('YouTube ID', '')
	start_time = row.get('Start Time', '0:00:00')
	secs = max(0, timestamp_to_seconds(start_time) - 2)

	links.append(f"https://www.youtube.com/watch?v={vid}&t={secs}s" if vid else '')
	times.append(start_time)
	thumbs.append(f"https://img.youtube.com/vi/{vid}/hqdefault.jpg" if vid else '')
	titles.append((row.get('Video Title') or '').title())
	scores.append(float(sims[row_idx]))

	# RAW context with same-video guard
	before_raw = after_raw = ''
	if row_idx > 0 and metadata.iloc[row_idx - 1].get('YouTube ID', '') == vid:
	before_raw = metadata.iloc[row_idx - 1].get(RAW_COL, '')
	if row_idx + 1 < len(metadata) and metadata.iloc[row_idx + 1].get('YouTube ID', '') == vid:
	after_raw = metadata.iloc[row_idx + 1].get(RAW_COL, '')

	match_raw = row.get(RAW_COL, '')
	match_hl = highlight_exact_phrase(match_raw, raw_query)

	context = f"{html.escape(before_raw or '')}\n{match_hl}\n{html.escape(after_raw or '')}"
	contexts.append(context)

	results['Context Block'] = contexts
	results['Jump Link'] = links
	results['Thumbnail'] = thumbs
	results['Video Title Clean'] = titles
	results['Readable Time'] = times
	results['Similarity Score'] = scores # handy for debugging/thresholds

	return results, total_hits

	# Routes
	@app.route("/", methods=["GET", "POST"])
	def index():
	results = None
	query = ""
	total_hits = 0
	if request.method == "POST":
	query = request.form.get("query", "")
	if query.strip():
	df, total_hits = search_subtitles(query)
	results = df.to_dict(orient="records") if not df.empty else []
	return render_template("index.html", results=results, query=query, total_hits=total_hits)

	if __name__ == "__main__":
	port = int(os.environ.get("PORT", 7860))
	app.run(host="0.0.0.0", port=port, debug=False)