Soumalya Das committed: Upload 4 files
Files changed:
- README.txt  +1 -0
- api.py  +88 -0
- extract_tokenizer.py  +34 -0
- requirements.txt  +4 -0
README.txt
ADDED
@@ -0,0 +1 @@
Keras 3 Transformer Movie Recommender - max compatibility build
api.py
ADDED
@@ -0,0 +1,88 @@
import numpy as np
import pandas as pd
import json
import pickle
import io
from sklearn.metrics.pairwise import cosine_similarity


class _ForgivingUnpickler(pickle.Unpickler):
    """Unpickler that maps every class reference to dict, so pickles that
    reference classes missing from this environment can still load."""

    def find_class(self, module, name):
        return dict


class MovieRecommender:
    def __init__(self, model_path="."):
        self.embeddings = np.load(f"{model_path}/embeddings.npy")
        self.embeddings = np.nan_to_num(self.embeddings)

        # Try loading the vocab from JSON first (preferred)
        try:
            with open(f"{model_path}/tokenizer_vocab.json", "r") as f:
                self.tokenizer = json.load(f)
        except FileNotFoundError:
            # Fallback: extract the vocab from the pickle file
            self.tokenizer = self._extract_vocab_from_pickle(f"{model_path}/tokenizer.pkl")
            # Save as JSON for future use
            with open(f"{model_path}/tokenizer_vocab.json", "w") as f:
                json.dump(self.tokenizer, f)

        self.movies = pd.read_json(f"{model_path}/movies.json")

    def _extract_vocab_from_pickle(self, filepath):
        """Extract the vocabulary dictionary from a pickle file by analyzing its structure."""
        with open(filepath, "rb") as f:
            pickle_data = f.read()

        # First attempt: unpickle with every class reference mapped to dict
        try:
            result = _ForgivingUnpickler(io.BytesIO(pickle_data)).load()
            if isinstance(result, dict):
                return result
        except Exception:
            pass

        # Fallback: scan the pickle bytecode for dict-building opcodes and try
        # to unpickle a prefix that ends right after each one
        try:
            import pickletools

            ops = list(pickletools.genops(pickle_data))
            for i, (opcode, arg, pos) in enumerate(ops):
                if opcode.name in ("EMPTY_DICT", "DICT", "SETITEMS"):
                    # The prefix ends where the next opcode starts
                    end = ops[i + 1][2] if i + 1 < len(ops) else len(pickle_data)
                    if end is None:
                        continue
                    try:
                        # Append a STOP opcode (b".") so load() returns the
                        # object on top of the stack at this point
                        subset = pickle_data[:end] + b"."
                        result = _ForgivingUnpickler(io.BytesIO(subset)).load()
                        if isinstance(result, dict) and result:
                            return result
                    except Exception:
                        pass
        except Exception:
            pass

        # Final fallback: return an empty dict
        print("Warning: Could not extract vocabulary from pickle. Using empty tokenizer.")
        print("Recommendation quality will be limited.")
        return {}

    def _encode(self, prompt):
        tokens = prompt.lower().split()[:32]
        ids = [self.tokenizer.get(t, 0) for t in tokens]
        # Clamp out-of-range ids so the embedding lookup cannot fail
        ids = [i if i < len(self.embeddings) else 0 for i in ids]
        return np.array(ids)[None, :]

    def recommend(self, prompt, topk=10):
        q_ids = self._encode(prompt)
        # Sum the token embeddings into a single query vector of shape (1, dim)
        query_vec = np.sum(self.embeddings[q_ids], axis=1)
        sims = cosine_similarity(query_vec, self.embeddings).flatten()
        idx = sims.argsort()[::-1][:topk]
        return self.movies.iloc[idx][["title", "release_date", "vote_average", "vote_count", "status"]]
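For quick verification, a minimal smoke test of the class above. This is a sketch, not part of the commit: the prompt and topk values are illustrative, and it assumes embeddings.npy, movies.json, and tokenizer_vocab.json (or tokenizer.pkl) sit next to api.py, as the default model_path="." expects.

# Hypothetical smoke test for api.py (not part of the commit); assumes the
# model artifacts live in the working directory.
from api import MovieRecommender

rec = MovieRecommender(model_path=".")
# Prompt and topk are illustrative values only
print(rec.recommend("space adventure with sentient robots", topk=5))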
extract_tokenizer.py
ADDED
@@ -0,0 +1,34 @@
import pickle
import json
import sys

# Minimal stand-in so pickles that reference SimpleTokenizer can load
class SimpleTokenizer:
    def __init__(self, vocab=None):
        self.vocab = vocab or {}

def is_clean_token(t):
    # Keep printable string keys containing no NUL or U+FFFD characters
    return isinstance(t, str) and t.isprintable() and not any(c in t for c in "\u0000\uFFFD")

try:
    with open("tokenizer.pkl", "rb") as f:
        tokenizer_obj = pickle.load(f)

    # Accept either a tokenizer object with a .vocab attribute or a bare dict
    vocab = tokenizer_obj.vocab if hasattr(tokenizer_obj, "vocab") else tokenizer_obj

    clean_vocab = {k: v for k, v in vocab.items() if is_clean_token(k)}

    with open("tokenizer_vocab.json", "w", encoding="utf-8") as f:
        json.dump(clean_vocab, f, indent=2, ensure_ascii=True)

    print("✓ Clean vocab extracted")
    print(f"✓ Original size: {len(vocab)}")
    print(f"✓ Clean size: {len(clean_vocab)}")

except Exception as e:
    print(f"✗ Error: {e}")
    sys.exit(1)
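A round-trip check of the extractor, as a sketch rather than part of the commit: it pickles a tiny made-up vocab as tokenizer.pkl, runs the script, and reads back the JSON it writes. The token names here are invented for illustration.

# Hypothetical round-trip check for extract_tokenizer.py (not part of the commit)
import json
import pickle
import subprocess
import sys

vocab = {"the": 1, "movie": 2, "robot": 3, "bad\ufffdtoken": 4}
with open("tokenizer.pkl", "wb") as f:
    # A plain dict works: the script falls back to the object itself
    # when there is no .vocab attribute
    pickle.dump(vocab, f)

subprocess.run([sys.executable, "extract_tokenizer.py"], check=True)

with open("tokenizer_vocab.json", "r", encoding="utf-8") as f:
    clean = json.load(f)
print(sorted(clean))  # "bad\ufffdtoken" is filtered out by is_clean_token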
requirements.txt
ADDED
@@ -0,0 +1,4 @@
numpy
pandas
scikit-learn
gradio
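The commit does not pin versions. As a quick sanity check before launching the Space, something like the following sketch (not part of the commit) confirms the four dependencies import, keeping in mind that scikit-learn imports as sklearn.

# Hypothetical environment check matching requirements.txt
import importlib

for name in ("numpy", "pandas", "sklearn", "gradio"):
    mod = importlib.import_module(name)
    print(name, getattr(mod, "__version__", "unknown"))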