Soumalya Das committed
Commit 342ab6d · verified · 1 Parent(s): 795bb0e

Upload 4 files

Files changed (4)
  1. README.txt +1 -0
  2. api.py +88 -0
  3. extract_tokenizer.py +34 -0
  4. requirements.txt +4 -0
README.txt ADDED
@@ -0,0 +1 @@
+ Keras 3 Transformer Movie Recommender – max compatibility build
api.py ADDED
@@ -0,0 +1,88 @@
+ import io
+ import json
+ import pickle
+ import pickletools
+
+ import numpy as np
+ import pandas as pd
+ from sklearn.metrics.pairwise import cosine_similarity
+
+
+ class MovieRecommender:
+     def __init__(self, model_path="."):
+         self.embeddings = np.load(f"{model_path}/embeddings.npy")
+         self.embeddings = np.nan_to_num(self.embeddings)
+
+         # Try loading the vocabulary from JSON first (preferred).
+         try:
+             with open(f"{model_path}/tokenizer_vocab.json", "r") as f:
+                 self.tokenizer = json.load(f)
+         except FileNotFoundError:
+             # Fallback: extract the vocab from the pickle file.
+             self.tokenizer = self._extract_vocab_from_pickle(f"{model_path}/tokenizer.pkl")
+             # Save as JSON so future loads skip the pickle path.
+             with open(f"{model_path}/tokenizer_vocab.json", "w") as f:
+                 json.dump(self.tokenizer, f)
+
+         self.movies = pd.read_json(f"{model_path}/movies.json")
+
+     def _extract_vocab_from_pickle(self, filepath):
+         """Extract a vocabulary dict from a pickle file without importing its classes."""
+         with open(filepath, "rb") as f:
+             pickle_data = f.read()
+
+         # First attempt: unpickle with every unknown class mapped to dict, so
+         # objects whose defining module is unavailable do not abort the load.
+         try:
+             unpickler = pickle.Unpickler(io.BytesIO(pickle_data))
+             unpickler.find_class = lambda module, name: dict
+             result = unpickler.load()
+             if isinstance(result, dict):
+                 return result
+         except Exception:
+             pass
+
+         # Second attempt: scan the pickle opcodes for dict-building operations
+         # and try to recover a dict from a truncated prefix of the stream.
+         # This is a best-effort heuristic and may find nothing.
+         try:
+             for opcode, arg, pos in pickletools.genops(pickle_data):
+                 if opcode.name in ("EMPTY_DICT", "DICT") and pos is not None:
+                     try:
+                         # Append a STOP opcode so the truncated stream terminates.
+                         subset = pickle_data[:pos + 10] + pickle.STOP
+                         test_unpickler = pickle.Unpickler(io.BytesIO(subset))
+                         test_unpickler.find_class = lambda m, n: dict
+                         candidate = test_unpickler.load()
+                         if isinstance(candidate, dict) and candidate:
+                             return candidate
+                     except Exception:
+                         pass
+         except Exception:
+             pass
+
+         # Final fallback: an empty vocabulary (every token then maps to id 0).
+         print("Warning: Could not extract vocabulary from pickle. Using empty tokenizer.")
+         print("Recommendation quality will be limited.")
+         return {}
+
+     def _encode(self, prompt):
+         # Lowercase, whitespace-tokenize, and cap the prompt at 32 tokens.
+         tokens = prompt.lower().split()[:32]
+         ids = [self.tokenizer.get(t, 0) for t in tokens]
+         # Clamp out-of-range ids to 0 so indexing the embedding table is safe.
+         ids = [i if i < len(self.embeddings) else 0 for i in ids]
+         return np.array(ids, dtype=int)[None, :]
+
+     def recommend(self, prompt, topk=10):
+         q_ids = self._encode(prompt)
+         # Sum the prompt-token embeddings into a single query vector, then
+         # rank all rows of the embedding table by cosine similarity.
+         query_vec = np.sum(self.embeddings[q_ids], axis=1)
+         sims = cosine_similarity(query_vec, self.embeddings).flatten()
+         idx = sims.argsort()[::-1][:topk]
+         return self.movies.iloc[idx][["title", "release_date", "vote_average", "vote_count", "status"]]
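For context, a minimal usage sketch; the artifact file names match what MovieRecommender loads, but the prompt string and topk value are illustrative:

from api import MovieRecommender

# Assumes embeddings.npy, movies.json, and tokenizer_vocab.json
# (or tokenizer.pkl) are present in the model_path directory.
rec = MovieRecommender(model_path=".")
print(rec.recommend("space adventure with robots", topk=5))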
extract_tokenizer.py ADDED
@@ -0,0 +1,34 @@
+ import pickle
+ import json
+ import sys
+
+
+ class SimpleTokenizer:
+     """Stand-in class so a pickled tokenizer that references it can be loaded."""
+     def __init__(self, vocab=None):
+         self.vocab = vocab or {}
+
+
+ def is_clean_token(t):
+     # Keep only printable strings that contain no NUL or replacement characters.
+     return isinstance(t, str) and t.isprintable() and not any(c in t for c in "\u0000\uFFFD")
+
+
+ try:
+     with open("tokenizer.pkl", "rb") as f:
+         tokenizer_obj = pickle.load(f)
+
+     # The pickle may hold either a tokenizer object or a bare vocab dict.
+     vocab = tokenizer_obj.vocab if hasattr(tokenizer_obj, "vocab") else tokenizer_obj
+
+     clean_vocab = {k: v for k, v in vocab.items() if is_clean_token(k)}
+
+     with open("tokenizer_vocab.json", "w", encoding="utf-8") as f:
+         json.dump(clean_vocab, f, indent=2, ensure_ascii=True)
+
+     print("✓ Clean vocab extracted")
+     print(f"✓ Original size: {len(vocab)}")
+     print(f"✓ Clean size: {len(clean_vocab)}")
+
+ except Exception as e:
+     print(f"✗ Error: {e}")
+     sys.exit(1)
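This is a one-off preprocessing step: run it once next to tokenizer.pkl to materialize tokenizer_vocab.json, after which api.py never needs the pickle fallback:

python extract_tokenizer.py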
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ numpy
+ pandas
+ scikit-learn
+ gradio
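Standard setup in a fresh environment:

pip install -r requirements.txt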