| """ |
Character Similarity Engine — Deployment App
=============================================
Gradio UI + FAISS vector search in a single file.

Local run:
    pip install gradio faiss-cpu sentence-transformers
    python deployment/app.py

HuggingFace Spaces:
    Upload this file + requirements.txt to a Gradio Space.
    No configuration switch is needed: when the SPACE_ID environment variable
    is set, the model is loaded from the HuggingFace Hub, so push your model
    to the Hub first.
| """ |
|
|
| import json, ast |
| import numpy as np |
| from pathlib import Path |
| import gradio as gr |
| import faiss |
| from sentence_transformers import SentenceTransformer |
|
|
| |
| import os |
# The presence of the SPACE_ID environment variable is treated as the marker
# for "running inside a HuggingFace Space".
ON_SPACES = "SPACE_ID" in os.environ

# Directory that contains this file; the FAISS index always lives next to it.
_APP_DIR = Path(__file__).resolve().parent

if not ON_SPACES:
    # Local checkout: model and processed data sit under the parent directory.
    _ROOT = _APP_DIR.parent
    MODEL_DIR = _ROOT / "models" / "charsim_final"
    PROC_DIR = _ROOT / "data" / "processed"
else:
    # On a Space the model is referenced by its Hub repo id (a plain string)
    # and the data files are read from the app directory itself.
    MODEL_DIR = "Phalakk/charsim-model"
    PROC_DIR = _APP_DIR
INDEX_DIR = _APP_DIR

# Cached vector index and the character metadata written alongside it.
INDEX_PATH = INDEX_DIR / "char_index.faiss"
META_PATH = INDEX_DIR / "char_meta.jsonl"
|
|
|
|
| |
| |
| |
|
|
def parse_genre(g):
    """Normalize a raw ``genre`` field into a list.

    Args:
        g: Either an actual list (returned unchanged), a string holding a
            Python-literal list such as ``"['Drama', 'Crime']"``, a plain
            genre string such as ``"Drama"``, or an empty/missing value.

    Returns:
        A list. Unparseable non-empty values are wrapped as a single-element
        list; empty values and a literal ``None`` yield ``[]``.
    """
    if isinstance(g, list):
        return g
    if not isinstance(g, str):
        # Nothing to parse: keep a truthy value as one entry, drop falsy ones.
        return [g] if g else []

    try:
        parsed = ast.literal_eval(g)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Not a Python literal (e.g. "Drama" or "Sci-Fi"): treat as one genre.
        return [g] if g else []

    # literal_eval can legitimately return non-list values; callers slice and
    # join the result, so always hand back a real list.
    if isinstance(parsed, (list, tuple)):
        return list(parsed)
    if parsed is None:
        return []
    if isinstance(parsed, str):
        return [parsed] if parsed else []
    # Numbers, dicts, etc.: fall back to the original text as a single genre.
    return [g]
|
|
|
|
# Startup banner.
print("=" * 50)
print(" Character Similarity Engine — Starting up")
print("=" * 50)

# Step 1: load the sentence-embedding model.
print(f"\n[1/3] Loading model from {MODEL_DIR}...")

# MODEL_DIR is either a Hub repo id (already a str) or a local Path;
# str() is a no-op for the former and converts the latter.
model = SentenceTransformer(str(MODEL_DIR))
print(" Model loaded.")
|
|
| |
# Step 2: load the character records (one JSON object per line).
print("[2/3] Loading characters from chars.jsonl...")
characters = []
# Read as UTF-8 explicitly so the result does not depend on the platform's
# default encoding.
with open(PROC_DIR / "chars.jsonl", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            # Tolerate blank or trailing empty lines in the JSONL file.
            continue
        c = json.loads(line)
        # Records whose name starts with "[Story]" are skipped
        # (presumably plot-level rows rather than characters — TODO confirm).
        if c["name"].startswith("[Story]"):
            continue
        # Skip descriptions shorter than 60 characters; a missing or null
        # description counts as empty.
        if len(c.get("description") or "") < 60:
            continue
        c["genre_parsed"] = parse_genre(c.get("genre", []))
        characters.append(c)
print(f" {len(characters):,} characters loaded.")
|
|
| |
# Step 3: load the cached FAISS index, or build and cache it.
index = None
if INDEX_PATH.exists():
    print(f"[3/3] Loading FAISS index from {INDEX_PATH}...")
    index = faiss.read_index(str(INDEX_PATH))
    if index.ntotal == len(characters):
        print(f" Index loaded: {index.ntotal:,} vectors.")
    else:
        # Search results are mapped back to `characters` by position, so an
        # index built from a different character list would silently return
        # the wrong characters. Discard it and rebuild.
        print(
            f" Index has {index.ntotal:,} vectors but {len(characters):,} "
            "characters were loaded; rebuilding."
        )
        index = None

if index is None:
    print(f"[3/3] Building FAISS index ({len(characters):,} characters)...")
    print(" This takes ~5 min on first run, then saves for reuse.")
    descriptions = [c["description"] for c in characters]
    embeddings = model.encode(
        descriptions,
        batch_size=64,
        show_progress_bar=True,
        normalize_embeddings=True,
    )
    dim = embeddings.shape[1]
    # Inner product over L2-normalized vectors equals cosine similarity.
    index = faiss.IndexFlatIP(dim)
    index.add(embeddings.astype(np.float32))

    # Persist the index so later runs can skip the encoding step.
    faiss.write_index(index, str(INDEX_PATH))

    # Write the filtered character records next to the index, one JSON
    # object per line, in the same order as the index vectors.
    with open(META_PATH, "w", encoding="utf-8") as f:
        for c in characters:
            f.write(json.dumps(c) + "\n")
    print(f" Index saved to {INDEX_PATH}")
|
|
| |
# Lookup table from lower-cased name -> position in `characters`.
# Both the full name and its first word are registered; when several
# characters share a key, the earliest one in the list keeps it.
name_index = {}
for i, c in enumerate(characters):
    name_index.setdefault(c["name"].lower(), i)

    # An empty or whitespace-only name has no first word to register.
    words = c["name"].split()
    if words:
        name_index.setdefault(words[0].lower(), i)


print("\n Ready! Starting Gradio UI...\n")
|
|
|
|
| |
| |
| |
|
|
def search(query_text: str, top_k: int = 6, exclude_id: "str | None" = None):
    """Encode query → FAISS search → return up to ``top_k`` results.

    Args:
        query_text: Free text to embed with the module-level ``model``.
        top_k: Maximum number of results to return; values <= 0 yield ``[]``.
        exclude_id: If not ``None``, any character whose ``"id"`` equals this
            value is left out (used to hide the query character itself).

    Returns:
        A list of dicts with the keys ``name``, ``movie``, ``year``,
        ``genre``, ``similarity``, ``description`` and ``id``, in the order
        returned by the index.
    """
    top_k = int(top_k)
    if top_k <= 0:
        return []

    emb = model.encode(
        [query_text],
        normalize_embeddings=True,
        convert_to_numpy=True,
    ).astype(np.float32)

    # Over-fetch a few candidates so that skipping the excluded character or
    # invalid slots can still leave top_k results.
    scores, indices = index.search(emb, top_k + 5)

    results = []
    for score, idx in zip(scores[0], indices[0]):
        # Skip slots that do not map to a loaded character (negative or
        # out-of-range positions).
        if idx < 0 or idx >= len(characters):
            continue
        c = characters[idx]
        # Compare against None explicitly so a falsy id (e.g. 0) still works.
        if exclude_id is not None and c["id"] == exclude_id:
            continue
        results.append({
            "name": c["name"],
            "movie": c["movie"],
            "year": c.get("year", ""),
            "genre": ", ".join(c["genre_parsed"][:2]),
            "similarity": round(float(score), 3),
            "description": c["description"][:250],
            "id": c["id"],
        })
        if len(results) >= top_k:
            break
    return results
|
|
|
|
| |
| |
| |
|
|
def handle_search(character_name: str, custom_desc: str, top_k: int) -> str:
    """Gradio callback: run a similarity search and render it as Markdown.

    A non-empty ``custom_desc`` is used as the query text directly. Otherwise
    ``character_name`` is resolved to a known character (exact name, then
    first word, then substring match) and that character's description is
    used as the query, with the character itself excluded from the results.

    Args:
        character_name: Name typed by the user (may be empty).
        custom_desc: Free-text description typed by the user (may be empty).
        top_k: Number of results requested; converted with ``int()``.

    Returns:
        A Markdown string: the result list, or a user-facing message when the
        input is empty, the name is unknown, or nothing was found.
    """
    character_name = (character_name or "").strip()
    custom_desc = (custom_desc or "").strip()

    if not character_name and not custom_desc:
        return "⚠️ Please enter a character name or a description."

    exclude_id = None

    if custom_desc:
        # Free-text mode takes priority over the name box.
        query_text = custom_desc
        anchor_info = f"📝 **Query description:** _{query_text[:120]}..._\n\n"
    else:
        name_key = character_name.lower()
        first_key = character_name.split()[0].lower()

        # Position 0 is a valid hit, so test for None instead of truthiness.
        char_idx = name_index.get(name_key)
        if char_idx is None:
            char_idx = name_index.get(first_key)
        if char_idx is None:
            # Last resort: first character whose name contains the query.
            char_idx = next(
                (i for i, c in enumerate(characters)
                 if name_key in c["name"].lower()),
                None,
            )
        if char_idx is None:
            return (f"❌ **'{character_name}' not found** in the database.\n\n"
                    f"Try using the description box instead — describe the "
                    f"character in your own words.")

        anchor = characters[char_idx]
        query_text = anchor["description"]
        exclude_id = anchor["id"]
        genres_str = ", ".join(anchor["genre_parsed"][:3]) or "unknown"
        anchor_info = (
            f"## 🎬 {anchor['name']}\n"
            f"**Movie/Show:** {anchor['movie']} ({anchor.get('year', '')})\n"
            f"**Genre:** {genres_str}\n\n"
            f"_{anchor['description'][:200]}..._\n\n"
            f"---\n\n"
        )

    results = search(query_text, top_k=int(top_k), exclude_id=exclude_id)

    if not results:
        return "No similar characters found. Try a different query."

    sections = [anchor_info, "## 🔍 Most Similar Characters\n\n"]

    for rank, r in enumerate(results, 1):
        # Ten-cell bar; clamp so negative or >1 scores cannot change its length.
        filled = max(0, min(10, round(r["similarity"] * 10)))
        bar = "█" * filled + "░" * (10 - filled)
        year = f" ({r['year']})" if r["year"] else ""
        genre = f" · _{r['genre']}_" if r["genre"] else ""
        sections.append(
            f"### {rank}. {r['name']}\n"
            f"**{r['movie']}**{year}{genre}\n\n"
            f"`{bar}` **{r['similarity']:.3f}**\n\n"
            f"{r['description']}...\n\n"
            f"---\n\n"
        )

    return "".join(sections)
|
|
|
|
| |
| |
| |
|
|
# Gradio layout: inputs in the left column, Markdown results on the right,
# clickable examples underneath.
with gr.Blocks(
    title="Character Similarity Engine",
    theme=gr.themes.Soft(),
) as demo:
    gr.Markdown(
        "# 🎬 Character Similarity Engine\n"
        "**Find the most similar characters across 34,000+ movies and shows.**\n"
        "\n"
        "Powered by a fine-tuned `all-MiniLM-L6-v2` model (Spearman: 0.77) "
        "trained on 88,000 character descriptions.\n"
    )

    with gr.Row():
        with gr.Column(scale=1):
            char_input = gr.Textbox(
                label="🎭 Character Name",
                placeholder="e.g. Sherlock Holmes, Indiana Jones, Dracula...",
            )
            gr.Markdown("**or**")
            desc_input = gr.Textbox(
                label="📝 Describe a character (free text)",
                placeholder="A ruthless crime boss who started as a small-time thug...",
                lines=4,
            )
            top_k_slider = gr.Slider(
                minimum=3, maximum=10, value=5, step=1,
                label="Number of results",
            )
            search_btn = gr.Button("Find Similar Characters 🔍", variant="primary")

        with gr.Column(scale=2):
            output_md = gr.Markdown(value="Results will appear here...")

    # Each example row is [character name, description, number of results].
    gr.Examples(
        label="💡 Try these examples",
        examples=[
            ["Dracula", "", 5],
            ["Sherlock Holmes", "", 5],
            ["Indiana Jones", "", 5],
            ["", "A brilliant scientist who becomes a monster after a failed experiment", 5],
            ["", "A young orphan who discovers they have magical powers and must save the world", 5],
            ["", "A hardboiled detective in a corrupt city who plays by his own rules", 5],
            ["", "A charismatic villain obsessed with power and revenge", 5],
        ],
        inputs=[char_input, desc_input, top_k_slider],
    )

    # The button and pressing Enter in either text box all run the same
    # search with the same inputs and output.
    search_wiring = dict(
        fn=handle_search,
        inputs=[char_input, desc_input, top_k_slider],
        outputs=output_md,
    )
    search_btn.click(**search_wiring)
    char_input.submit(**search_wiring)
    desc_input.submit(**search_wiring)

    gr.Markdown(
        "---\n"
        "*Built with sentence-transformers · FAISS · Gradio · "
        "Fine-tuned on Wikipedia movie plots*\n"
    )
|
|
if __name__ == "__main__":
    # Serve on all network interfaces at port 7860, without a public share
    # link, and surface callback errors in the UI.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "show_error": True,
    }
    demo.launch(**launch_options)