Spaces:

Phalakk
/

charsim

Sleeping

App Files Files Community

charsim / app.py

Phalakk

Update app.py

615aca5 verified 8 days ago

raw

history blame contribute delete

11.7 kB

	"""
	Character Similarity Engine — Deployment App
	=============================================
	Gradio UI + FAISS vector search in a single file.

	Local run:
	pip install gradio faiss-cpu sentence-transformers
	python deployment/app.py

	HuggingFace Spaces:
	Upload this file + requirements.txt (and chars.jsonl) to a Gradio Space.
	Spaces mode is detected automatically from the SPACE_ID environment variable,
	so push your model to HuggingFace Hub first; it is loaded from there.
	"""

	import json, ast
	import numpy as np
	from pathlib import Path
	import gradio as gr
	import faiss
	from sentence_transformers import SentenceTransformer

	# ── Config ─────────────────────────────────────────────────────────────────────
	import os
	# True when the SPACE_ID environment variable is present.
	ON_SPACES = "SPACE_ID" in os.environ

	# The FAISS index and its metadata always live next to this file.
	INDEX_DIR = Path(__file__).resolve().parent

	if not ON_SPACES:
	    # Local checkout: model and processed data sit one level above this file.
	    _ROOT = INDEX_DIR.parent
	    MODEL_DIR = _ROOT / "models" / "charsim_final"
	    PROC_DIR = _ROOT / "data" / "processed"
	else:
	    # On Spaces the model is referenced by repo id and data sits beside the app.
	    MODEL_DIR = "Phalakk/charsim-model"
	    PROC_DIR = INDEX_DIR

	INDEX_PATH = INDEX_DIR / "char_index.faiss"
	META_PATH = INDEX_DIR / "char_meta.jsonl"


	# ══════════════════════════════════════════════════════════════════════════════
	# STARTUP — Load model + build/load FAISS index
	# ══════════════════════════════════════════════════════════════════════════════

	def parse_genre(g):
	    """
	    Normalize a raw ``genre`` value into a list.

	    A list is returned unchanged. Any other value is parsed as a Python
	    literal (e.g. ``"['Drama', 'Crime']"``): a parsed list/tuple becomes a
	    list and a parsed string becomes a one-element list. Values that cannot
	    be parsed, or that parse to another type (e.g. ``"1999"`` -> int), fall
	    back to ``[g]``, or ``[]`` when ``g`` is empty or None.
	    """
	    if isinstance(g, list):
	        return g
	    try:
	        parsed = ast.literal_eval(g)
	    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
	        # Not a Python literal: plain text such as "Drama", None, etc.
	        return [g] if g else []
	    if isinstance(parsed, (list, tuple)):
	        return list(parsed)
	    if isinstance(parsed, str):
	        return [parsed] if parsed else []
	    # The literal was a scalar (number, bool, ...); callers slice and join
	    # the result, so hand back the raw value wrapped in a list instead.
	    return [g] if g else []


	# Startup sequence (runs at import time): load the embedding model, load and
	# filter the character records, load or build the FAISS index, then build
	# the name -> position lookup used by the name search.
	print("=" * 50)
	print(" Character Similarity Engine — Starting up")
	print("=" * 50)

	# Load model
	print(f"\n[1/3] Loading model from {MODEL_DIR}...")

	# MODEL_DIR is a repo-id string on Spaces and a Path locally; str() is a
	# no-op for the former and converts the latter.
	model = SentenceTransformer(str(MODEL_DIR))
	print(" Model loaded.")

	# Load characters — filter out [Story] fallback entries
	print("[2/3] Loading characters from chars.jsonl...")
	characters = []
	# Explicit UTF-8 so loading does not depend on the platform's default locale.
	with open(PROC_DIR / "chars.jsonl", encoding="utf-8") as f:
	    for line in f:
	        if not line.strip():
	            continue  # tolerate blank lines in the JSONL file
	        c = json.loads(line)
	        if c["name"].startswith("[Story]"):  # skip movie-level fallbacks
	            continue
	        # `or ""` also covers an explicit null description.
	        if len(c.get("description") or "") < 60:
	            continue
	        c["genre_parsed"] = parse_genre(c.get("genre", []))
	        characters.append(c)
	print(f" {len(characters):,} characters loaded.")

	# Build or load FAISS index
	index = None
	if INDEX_PATH.exists():
	    print(f"[3/3] Loading FAISS index from {INDEX_PATH}...")
	    index = faiss.read_index(str(INDEX_PATH))
	    print(f" Index loaded: {index.ntotal:,} vectors.")
	    # Search results are positions into `characters`; a cached index with a
	    # different vector count would map hits to the wrong records.
	    if characters and index.ntotal != len(characters):
	        print(" Index size does not match the character list; rebuilding.")
	        index = None

	if index is None:
	    print(f"[3/3] Building FAISS index ({len(characters):,} characters)...")
	    print(" This takes ~5 min on first run, then saves for reuse.")
	    descriptions = [c["description"] for c in characters]
	    embeddings = model.encode(
	        descriptions,
	        batch_size=64,
	        show_progress_bar=True,
	        normalize_embeddings=True,  # needed for cosine similarity via inner product
	    )
	    dim = embeddings.shape[1]
	    index = faiss.IndexFlatIP(dim)  # inner product = cosine sim when normalized
	    index.add(embeddings.astype(np.float32))

	    faiss.write_index(index, str(INDEX_PATH))
	    # Save filtered metadata alongside index
	    with open(META_PATH, "w", encoding="utf-8") as f:
	        for c in characters:
	            f.write(json.dumps(c) + "\n")
	    print(f" Index saved to {INDEX_PATH}")

	# Build name lookup for character search: lowercase full name and lowercase
	# first word each map to the first character that registered them.
	name_index = {}
	for i, c in enumerate(characters):
	    name_index.setdefault(c["name"].lower(), i)
	    name_parts = c["name"].split()
	    if name_parts:  # an empty/whitespace-only name has no first word
	        name_index.setdefault(name_parts[0].lower(), i)

	print("\n Ready! Starting Gradio UI...\n")


	# ══════════════════════════════════════════════════════════════════════════════
	# SEARCH
	# ══════════════════════════════════════════════════════════════════════════════

	def search(query_text: str, top_k: int = 6, exclude_id: str = None):
	    """
	    Embed ``query_text``, query the FAISS index, and return up to ``top_k`` hits.

	    Each hit is a dict with the keys ``name``, ``movie``, ``year``, ``genre``,
	    ``similarity``, ``description`` and ``id``. When ``exclude_id`` is set,
	    the character with that id is left out of the results.
	    """
	    query_vec = model.encode(
	        [query_text],
	        normalize_embeddings=True,
	        convert_to_numpy=True,
	    )
	    query_vec = query_vec.astype(np.float32)

	    # Ask for a few more neighbours than requested so skipped rows can be replaced.
	    fetch_count = top_k + 5
	    scores, indices = index.search(query_vec, fetch_count)

	    hits = []
	    for position, raw_score in zip(indices[0], scores[0]):
	        # Ignore positions that do not point at a loaded character.
	        if not (0 <= position < len(characters)):
	            continue
	        candidate = characters[position]
	        if exclude_id and candidate["id"] == exclude_id:
	            continue
	        hit = {
	            "name": candidate["name"],
	            "movie": candidate["movie"],
	            "year": candidate.get("year", ""),
	            "genre": ", ".join(candidate["genre_parsed"][:2]),
	            "similarity": round(float(raw_score), 3),
	            "description": candidate["description"][:250],
	            "id": candidate["id"],
	        }
	        hits.append(hit)
	        if len(hits) == top_k:
	            break
	    return hits


	# ══════════════════════════════════════════════════════════════════════════════
	# GRADIO HANDLER
	# ══════════════════════════════════════════════════════════════════════════════

	def handle_search(character_name: str, custom_desc: str, top_k: int) -> str:
	    """
	    Gradio callback: run a similarity search and render the results as Markdown.

	    A non-empty ``custom_desc`` is used directly as the query. Otherwise
	    ``character_name`` is resolved to a known character (exact name, then
	    first word, then first substring match) and that character's description
	    becomes the query, with the character itself excluded from the results.

	    Returns a Markdown string: the results, or a message when no input was
	    given, the name is unknown, or nothing similar was found.
	    """
	    # Treat a missing value like an empty box.
	    character_name = (character_name or "").strip()
	    custom_desc = (custom_desc or "").strip()

	    if not character_name and not custom_desc:
	        return "⚠️ Please enter a character name or a description."

	    exclude_id = None

	    # Determine query text
	    if custom_desc:
	        query_text = custom_desc
	        anchor_info = f"🔍 Query description: _{query_text[:120]}..._\n\n"
	    else:
	        # Find character by name. Compare against None explicitly: position 0
	        # is a valid hit and must not be treated as "not found".
	        name_key = character_name.lower()
	        char_idx = name_index.get(name_key)
	        if char_idx is None:
	            first_key = character_name.split()[0].lower()
	            char_idx = name_index.get(first_key)
	        if char_idx is None:
	            # Fuzzy fallback — first character whose name contains the query
	            char_idx = next(
	                (i for i, c in enumerate(characters)
	                 if name_key in c["name"].lower()),
	                None,
	            )
	        if char_idx is None:
	            return (f"❌ '{character_name}' not found in the database.\n\n"
	                    f"Try using the description box instead — describe the "
	                    f"character in your own words.")

	        anchor = characters[char_idx]
	        query_text = anchor["description"]
	        exclude_id = anchor["id"]
	        genres_str = ", ".join(anchor["genre_parsed"][:3]) or "unknown"
	        anchor_info = (
	            f"## 🎬 {anchor['name']}\n"
	            f"Movie/Show: {anchor['movie']} ({anchor.get('year', '')})\n"
	            f"Genre: {genres_str}\n\n"
	            f"_{anchor['description'][:200]}..._\n\n"
	            f"---\n\n"
	        )

	    # Search
	    results = search(query_text, top_k=int(top_k), exclude_id=exclude_id)

	    if not results:
	        return "No similar characters found. Try a different query."

	    sections = [anchor_info + "## 🔗 Most Similar Characters\n\n"]

	    for i, r in enumerate(results, 1):
	        # Clamp to 0..10 so a negative (or >1) score still draws a 10-cell bar.
	        filled = max(0, min(10, round(r["similarity"] * 10)))
	        bar = "█" * filled + "░" * (10 - filled)
	        year = f" ({r['year']})" if r["year"] else ""
	        genre = f" · _{r['genre']}_" if r["genre"] else ""
	        sections.append(
	            f"### {i}. {r['name']}\n"
	            f"{r['movie']}{year}{genre}\n\n"
	            f"`{bar}` {r['similarity']:.3f}\n\n"
	            f"{r['description']}...\n\n"
	            f"---\n\n"
	        )

	    return "".join(sections)


	# ══════════════════════════════════════════════════════════════════════════════
	# GRADIO UI
	# ══════════════════════════════════════════════════════════════════════════════

	# Declares the whole page as `demo`: header Markdown, a row with an input
	# column and a results column, example queries, event wiring, and a footer.
	with gr.Blocks(
	title="Character Similarity Engine",
	theme=gr.themes.Soft(),
	) as demo:
	gr.Markdown("""
	# 🎬 Character Similarity Engine
	Find the most similar characters across 34,000+ movies and shows.

	Powered by a fine-tuned `all-MiniLM-L6-v2` model (Spearman: 0.77) trained on 88,000 character descriptions.
	""")

	# First column (scale=1) holds the inputs; second column (scale=2) holds
	# the Markdown component that receives handle_search's output.
	with gr.Row():
	with gr.Column(scale=1):
	char_input = gr.Textbox(
	label="🔎 Character Name",
	placeholder="e.g. Sherlock Holmes, Indiana Jones, Dracula...",
	)
	gr.Markdown("or")
	desc_input = gr.Textbox(
	label="📝 Describe a character (free text)",
	placeholder="A ruthless crime boss who started as a small-time thug...",
	lines=4,
	)
	# Number of results to request: 3 to 10 in steps of 1, default 5.
	top_k_slider = gr.Slider(
	minimum=3, maximum=10, value=5, step=1,
	label="Number of results"
	)
	search_btn = gr.Button("Find Similar Characters 🚀", variant="primary")

	with gr.Column(scale=2):
	output_md = gr.Markdown(value="Results will appear here...")

	# Each example row is [character name, description, number of results],
	# in the same order as the `inputs` list below.
	gr.Examples(
	label="💡 Try these examples",
	examples=[
	["Dracula", "", 5],
	["Sherlock Holmes", "", 5],
	["Indiana Jones", "", 5],
	["", "A brilliant scientist who becomes a monster after a failed experiment", 5],
	["", "A young orphan who discovers they have magical powers and must save the world", 5],
	["", "A hardboiled detective in a corrupt city who plays by his own rules", 5],
	["", "A charismatic villain obsessed with power and revenge", 5],
	],
	inputs=[char_input, desc_input, top_k_slider],
	)

	# The button click and both text boxes' submit events call handle_search
	# with the same three inputs and write the result to output_md.
	search_btn.click(
	fn=handle_search,
	inputs=[char_input, desc_input, top_k_slider],
	outputs=output_md,
	)
	# Also trigger on Enter in text boxes
	char_input.submit(fn=handle_search,
	inputs=[char_input, desc_input, top_k_slider],
	outputs=output_md)
	desc_input.submit(fn=handle_search,
	inputs=[char_input, desc_input, top_k_slider],
	outputs=output_md)

	# Footer credits.
	gr.Markdown("""
	---
	Built with sentence-transformers · FAISS · Gradio · Fine-tuned on Wikipedia movie plots
	""")

	if __name__ == "__main__":
	    # Launch only when executed as a script: host 0.0.0.0, port 7860.
	    launch_options = {
	        "server_name": "0.0.0.0",
	        "server_port": 7860,
	        "share": False,  # set True for a temporary public URL
	        "show_error": True,
	    }
	    demo.launch(**launch_options)