charsim / app.py
Phalakk's picture
Update app.py
615aca5 verified
"""
Character Similarity Engine — Deployment App
=============================================
Gradio UI + FAISS vector search in a single file.
Local run:
pip install gradio faiss-cpu sentence-transformers
python deployment/app.py
HuggingFace Spaces:
Upload this file + requirements.txt to a Gradio Space.
The app detects Spaces through the SPACE_ID environment variable and then loads the model by its Hub id, so push your model to HuggingFace Hub first.
"""
import json, ast
import numpy as np
from pathlib import Path
import gradio as gr
import faiss
from sentence_transformers import SentenceTransformer
# ── Config ─────────────────────────────────────────────────────────────────────
import os

# Hugging Face Spaces sets SPACE_ID; its presence selects the hosted layout.
ON_SPACES = os.getenv("SPACE_ID") is not None

# The FAISS index and its metadata always live next to this file.
INDEX_DIR = Path(__file__).resolve().parent

if ON_SPACES:
    # Hosted: model referenced by its Hub id, data files sit beside the app.
    MODEL_DIR = "Phalakk/charsim-model"
    PROC_DIR = INDEX_DIR
else:
    # Local: model and processed data are resolved from the project root,
    # one level above this file's directory.
    _ROOT = INDEX_DIR.parent
    MODEL_DIR = _ROOT / "models" / "charsim_final"
    PROC_DIR = _ROOT / "data" / "processed"

INDEX_PATH = INDEX_DIR / "char_index.faiss"
META_PATH = INDEX_DIR / "char_meta.jsonl"
# ══════════════════════════════════════════════════════════════════════════════
# STARTUP — Load model + build/load FAISS index
# ══════════════════════════════════════════════════════════════════════════════
def parse_genre(g):
    """Normalise a raw ``genre`` field into a list.

    Args:
        g: Either an existing list (returned unchanged), a string holding a
            Python literal such as ``"['Drama', 'Crime']"``, a plain genre
            name such as ``"Drama"``, or an empty/missing value.

    Returns:
        A list of genres. Empty or missing input yields ``[]``; input that
        is not a parsable list/tuple/set literal is wrapped as a
        single-element list so callers can always slice and join the result.
    """
    if isinstance(g, list):
        return g
    if not g:
        return []
    try:
        parsed = ast.literal_eval(g)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Plain names like "Drama" or "Film Noir" are not Python literals.
        return [g]
    if isinstance(parsed, list):
        return parsed
    if isinstance(parsed, (tuple, set, frozenset)):
        return list(parsed)
    if isinstance(parsed, str):
        # A quoted scalar such as "'Drama'": unwrap the quotes.
        return [parsed] if parsed else []
    # Any other literal (number, dict, ...): keep the raw text as one genre
    # rather than handing callers a non-list value.
    return [g]
# Startup banner, emitted as three lines in a single call.
print(
    "=" * 50,
    " Character Similarity Engine β€” Starting up",
    "=" * 50,
    sep="\n",
)

# Load the sentence-transformer model. MODEL_DIR is either a Hub id string
# or a local Path; str() leaves the former untouched and converts the latter.
print(f"\n[1/3] Loading model from {MODEL_DIR}...")
model = SentenceTransformer(str(MODEL_DIR))
print(" Model loaded.")
# Load characters from chars.jsonl (one JSON object per line), skipping
# "[Story]" movie-level fallback entries and descriptions shorter than
# 60 characters.
print(f"[2/3] Loading characters from chars.jsonl...")
characters = []
# Explicit UTF-8: the platform default encoding may not decode the file.
with open(PROC_DIR / "chars.jsonl", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        c = json.loads(line)
        if c["name"].startswith("[Story]"):  # skip movie-level fallbacks
            continue
        # `or ""` also covers an explicit null description.
        if len(c.get("description") or "") < 60:
            continue
        c["genre_parsed"] = parse_genre(c.get("genre", []))
        characters.append(c)
print(f" {len(characters):,} characters loaded.")
# Build or load the FAISS index.
# Search hits are mapped back to `characters` by position, so a cached index
# is only usable when it holds exactly one vector per loaded character.
index = None
if INDEX_PATH.exists():
    print(f"[3/3] Loading FAISS index from {INDEX_PATH}...")
    index = faiss.read_index(str(INDEX_PATH))
    if index.ntotal == len(characters):
        print(f" Index loaded: {index.ntotal:,} vectors.")
    else:
        # Stale cache (the character file changed since the index was
        # saved): discard it instead of returning mismatched characters.
        print(f" Cached index has {index.ntotal:,} vectors but "
              f"{len(characters):,} characters were loaded; rebuilding.")
        index = None
else:
    print(f"[3/3] Building FAISS index ({len(characters):,} characters)...")

if index is None:
    print(" This takes ~5 min on first run, then saves for reuse.")
    descriptions = [c["description"] for c in characters]
    embeddings = model.encode(
        descriptions,
        batch_size=64,
        show_progress_bar=True,
        normalize_embeddings=True,  # needed for cosine similarity via inner product
    )
    dim = embeddings.shape[1]
    index = faiss.IndexFlatIP(dim)  # inner product = cosine sim when normalized
    index.add(embeddings.astype(np.float32))
    faiss.write_index(index, str(INDEX_PATH))
    # Save the filtered metadata alongside the index.
    with open(META_PATH, "w", encoding="utf-8") as f:
        for c in characters:
            f.write(json.dumps(c) + "\n")
    print(f" Index saved to {INDEX_PATH}")
# Build the name lookup used by character search: lower-cased full name and
# lower-cased first word of the name, each mapped to the position of the
# FIRST character that carries it (later duplicates never overwrite).
name_index = {}
for i, c in enumerate(characters):
    name_index.setdefault(c["name"].lower(), i)
    name_words = c["name"].split()
    if name_words:
        # Guarded: an empty or whitespace-only name has no first word.
        name_index.setdefault(name_words[0].lower(), i)
print("\n Ready! Starting Gradio UI...\n")
# ══════════════════════════════════════════════════════════════════════════════
# SEARCH
# ══════════════════════════════════════════════════════════════════════════════
def search(query_text: str, top_k: int = 6, exclude_id: str = None):
    """Return the characters most similar to ``query_text``.

    The query is encoded with the module-level ``model`` (normalised, so the
    inner-product index scores behave as cosine similarity) and looked up in
    the module-level FAISS ``index``.

    Args:
        query_text: Free-text description to search with.
        top_k: Maximum number of results to return; values <= 0 yield ``[]``.
        exclude_id: If set, the character whose ``id`` equals this value is
            left out (used to drop the query character itself).

    Returns:
        A list of at most ``top_k`` dicts with the keys ``name``, ``movie``,
        ``year``, ``genre``, ``similarity``, ``description`` (first 250
        characters) and ``id``, in the order FAISS returned them.
    """
    if top_k <= 0:
        return []

    emb = model.encode(
        [query_text],
        normalize_embeddings=True,
        convert_to_numpy=True,
    ).astype(np.float32)

    # Fetch a few extra neighbours so filtered-out hits can be replaced.
    scores, indices = index.search(emb, top_k + 5)

    results = []
    for score, idx in zip(scores[0], indices[0]):
        # Skip positions outside `characters` (FAISS reports -1 when it has
        # fewer neighbours than requested).
        if not 0 <= idx < len(characters):
            continue
        c = characters[idx]
        if exclude_id and c["id"] == exclude_id:
            continue
        results.append({
            "name": c["name"],
            "movie": c["movie"],
            "year": c.get("year", ""),
            # str() keeps the join safe if a genre entry is not a string.
            "genre": ", ".join(str(g) for g in c["genre_parsed"][:2]),
            "similarity": round(float(score), 3),
            "description": c["description"][:250],
            "id": c["id"],
        })
        if len(results) >= top_k:
            break
    return results
# ══════════════════════════════════════════════════════════════════════════════
# GRADIO HANDLER
# ══════════════════════════════════════════════════════════════════════════════
def handle_search(character_name: str, custom_desc: str, top_k: int) -> str:
    """Gradio callback: run a similarity search and render it as Markdown.

    A non-empty ``custom_desc`` takes precedence and is used as the query
    text. Otherwise ``character_name`` is resolved through ``name_index``
    (full name, then first word) with a substring scan of ``characters`` as
    a last resort, and that character's description becomes the query while
    the character itself is excluded from the results.

    Args:
        character_name: Name typed into the name box (may be empty/None).
        custom_desc: Free-text description (may be empty/None).
        top_k: Number of results requested; converted with ``int()``.

    Returns:
        A Markdown string: the results, or a message when the inputs are
        empty, the name is unknown, or nothing similar was found.
    """
    # NOTE(review): several emoji / box-drawing literals below look
    # mis-encoded (UTF-8 decoded with a legacy code page) - confirm the
    # file's encoding. They are reproduced verbatim here.
    character_name = (character_name or "").strip()
    custom_desc = (custom_desc or "").strip()
    if not character_name and not custom_desc:
        return "⚠️ Please enter a character name or a description."

    anchor_info = ""
    exclude_id = None

    if custom_desc:
        query_text = custom_desc
        anchor_info = f"πŸ” **Query description:** _{query_text[:120]}..._\n\n"
    else:
        name_key = character_name.lower()
        # Compare against None explicitly: position 0 is a valid hit and
        # must not be treated as "not found".
        char_idx = name_index.get(name_key)
        if char_idx is None:
            char_idx = name_index.get(name_key.split()[0])
        if char_idx is None:
            # Fuzzy fallback: first character whose name contains the query.
            char_idx = next(
                (i for i, c in enumerate(characters)
                 if name_key in c["name"].lower()),
                None,
            )
        if char_idx is None:
            return (f"❌ **'{character_name}' not found** in the database.\n\n"
                    f"Try using the description box instead β€” describe the "
                    f"character in your own words.")

        anchor = characters[char_idx]
        query_text = anchor["description"]
        exclude_id = anchor["id"]
        genres_str = ", ".join(anchor["genre_parsed"][:3]) or "unknown"
        anchor_info = (
            f"## 🎬 {anchor['name']}\n"
            f"**Movie/Show:** {anchor['movie']} ({anchor.get('year', '')})\n"
            f"**Genre:** {genres_str}\n\n"
            f"_{anchor['description'][:200]}..._\n\n"
            f"---\n\n"
        )

    results = search(query_text, top_k=int(top_k), exclude_id=exclude_id)
    if not results:
        return "No similar characters found. Try a different query."

    parts = [anchor_info, "## πŸ”— Most Similar Characters\n\n"]
    for i, r in enumerate(results, 1):
        # Clamp to the 10-cell bar: inner-product scores can be negative.
        filled = min(10, max(0, round(r["similarity"] * 10)))
        bar = "β–ˆ" * filled + "β–‘" * (10 - filled)
        year = f" ({r['year']})" if r["year"] else ""
        genre = f" Β· _{r['genre']}_" if r["genre"] else ""
        parts.append(
            f"### {i}. {r['name']}\n"
            f"**{r['movie']}**{year}{genre}\n\n"
            f"`{bar}` **{r['similarity']:.3f}**\n\n"
            f"{r['description']}...\n\n"
            f"---\n\n"
        )
    return "".join(parts)
# ══════════════════════════════════════════════════════════════════════════════
# GRADIO UI
# ══════════════════════════════════════════════════════════════════════════════
# Layout: intro text, then a row with the query controls (left column) and
# the Markdown result pane (right column), followed by clickable examples
# and a footer. All three triggers call handle_search with the same wiring.
with gr.Blocks(title="Character Similarity Engine", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        "\n"
        "# 🎬 Character Similarity Engine\n"
        "**Find the most similar characters across 34,000+ movies and shows.**\n"
        "Powered by a fine-tuned `all-MiniLM-L6-v2` model (Spearman: 0.77) trained on 88,000 character descriptions.\n"
    )

    with gr.Row():
        with gr.Column(scale=1):
            char_input = gr.Textbox(
                label="πŸ”Ž Character Name",
                placeholder="e.g. Sherlock Holmes, Indiana Jones, Dracula...",
            )
            gr.Markdown("**or**")
            desc_input = gr.Textbox(
                label="πŸ“ Describe a character (free text)",
                placeholder="A ruthless crime boss who started as a small-time thug...",
                lines=4,
            )
            top_k_slider = gr.Slider(
                minimum=3,
                maximum=10,
                value=5,
                step=1,
                label="Number of results",
            )
            search_btn = gr.Button("Find Similar Characters πŸš€", variant="primary")
        with gr.Column(scale=2):
            output_md = gr.Markdown(value="Results will appear here...")

    # Each example row is [character name, description, number of results].
    gr.Examples(
        label="πŸ’‘ Try these examples",
        examples=[
            ["Dracula", "", 5],
            ["Sherlock Holmes", "", 5],
            ["Indiana Jones", "", 5],
            ["", "A brilliant scientist who becomes a monster after a failed experiment", 5],
            ["", "A young orphan who discovers they have magical powers and must save the world", 5],
            ["", "A hardboiled detective in a corrupt city who plays by his own rules", 5],
            ["", "A charismatic villain obsessed with power and revenge", 5],
        ],
        inputs=[char_input, desc_input, top_k_slider],
    )

    # Button click, plus Enter in either text box, all run the same search.
    for register in (search_btn.click, char_input.submit, desc_input.submit):
        register(
            fn=handle_search,
            inputs=[char_input, desc_input, top_k_slider],
            outputs=output_md,
        )

    gr.Markdown(
        "\n"
        "---\n"
        "*Built with sentence-transformers Β· FAISS Β· Gradio Β· Fine-tuned on Wikipedia movie plots*\n"
    )
if __name__ == "__main__":
    # Launch settings collected in one place; passed through unchanged.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,  # set True for a temporary public URL
        "show_error": True,
    }
    demo.launch(**launch_options)