Soumalya Das committed on
Commit
b0986f4
·
verified ·
1 Parent(s): 2155e0d

Upload folder using huggingface_hub

Browse files
Files changed (12) hide show
  1. .gitattributes +4 -35
  2. .gitignore +2 -0
  3. README.txt +1 -0
  4. api.py +88 -0
  5. app.py +23 -0
  6. embeddings.npy +3 -0
  7. extract_tokenizer.py +34 -0
  8. movies.json +3 -0
  9. requirements.txt +4 -0
  10. tf_model.keras +3 -0
  11. tokenizer.pkl +3 -0
  12. tokenizer_vocab.json +3 -0
.gitattributes CHANGED
@@ -1,35 +1,4 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ *.npy filter=lfs diff=lfs merge=lfs -text
2
+ *.json filter=lfs diff=lfs merge=lfs -text
3
+ *.keras filter=lfs diff=lfs merge=lfs -text
4
+ *.pkl filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ **/__pycache__/
2
+ temp/
README.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ Keras 3 Transformer Movie Recommender – max compatibility build
api.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import pandas as pd
3
+ import json
4
+ import pickle
5
+ import io
6
+ from sklearn.metrics.pairwise import cosine_similarity
7
+
8
class MovieRecommender:
    """Prompt-based movie recommender backed by precomputed token embeddings.

    Loads a token-embedding matrix, a token->id vocabulary, and movie
    metadata from ``model_path``, then ranks movies by cosine similarity
    between the embedded prompt and each movie's embedding row.
    """

    def __init__(self, model_path="."):
        # Sanitize NaN/Inf up front so cosine similarity stays well-defined.
        self.embeddings = np.nan_to_num(np.load(f"{model_path}/embeddings.npy"))

        # Prefer the plain-JSON vocabulary; fall back to recovering it from
        # the pickle, and cache the recovered vocab as JSON for next startup.
        try:
            with open(f"{model_path}/tokenizer_vocab.json", "r") as f:
                self.tokenizer = json.load(f)
        except FileNotFoundError:
            self.tokenizer = self._extract_vocab_from_pickle(f"{model_path}/tokenizer.pkl")
            with open(f"{model_path}/tokenizer_vocab.json", "w") as f:
                json.dump(self.tokenizer, f)

        self.movies = pd.read_json(f"{model_path}/movies.json")

    def _extract_vocab_from_pickle(self, filepath):
        """Best-effort recovery of a vocab dict from a tokenizer pickle.

        The pickle may reference classes that are not importable here, so
        every class lookup is redirected to ``dict`` during unpickling; a
        payload that is itself a plain dict survives intact.  Returns ``{}``
        (with a warning) when nothing dict-like can be recovered.

        NOTE(security): unpickling can execute arbitrary code — only call
        this on trusted, locally produced model files.
        """
        with open(filepath, "rb") as f:
            pickle_data = f.read()

        try:
            unpickler = pickle.Unpickler(io.BytesIO(pickle_data))
            # Redirect any class the pickle references to dict so missing
            # project classes do not abort the load.
            unpickler.find_class = lambda module, name: dict
            result = unpickler.load()
            if isinstance(result, dict):
                return result
        except Exception:
            # Corrupt or unsupported pickle — fall through to the empty vocab.
            pass

        print("Warning: Could not extract vocabulary from pickle. Using empty tokenizer.")
        print("Recommendation quality will be limited.")
        return {}

    def _encode(self, prompt):
        """Map a free-text prompt to a (1, n_tokens) array of embedding ids.

        Lower-cases, whitespace-tokenizes, truncates to 32 tokens, and maps
        unknown or out-of-range tokens to id 0.  An empty prompt yields a
        (1, 0) integer array (dtype=int keeps empty-prompt indexing valid).
        """
        tokens = prompt.lower().split()[:32]
        ids = [self.tokenizer.get(t, 0) for t in tokens]
        # Guard against vocab ids that exceed the embedding table.
        ids = [i if i < len(self.embeddings) else 0 for i in ids]
        return np.array(ids, dtype=int)[None, :]

    def recommend(self, prompt, topk=10):
        """Return the ``topk`` movies most similar to ``prompt``.

        The prompt vector is the sum of its token embeddings; similarity is
        cosine similarity against every embedding row.  Returns a DataFrame
        slice with title/release/vote/status columns.
        """
        q_ids = self._encode(prompt)
        query_vec = np.sum(self.embeddings[q_ids], axis=1)
        sims = cosine_similarity(query_vec, self.embeddings).flatten()
        idx = sims.argsort()[::-1][:topk]
        return self.movies.iloc[idx][["title", "release_date", "vote_average", "vote_count", "status"]]
88
+
app.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import gradio as gr
from api import MovieRecommender

# Single shared recommender, loaded once at startup.
recommender = MovieRecommender()


def recommend_movies(prompt, topk):
    """Gradio callback: return a DataFrame with the top-k recommendations."""
    return recommender.recommend(prompt, topk=int(topk))


# Prompt textbox + top-k slider in, recommendations table out.
demo = gr.Interface(
    fn=recommend_movies,
    inputs=[
        gr.Textbox(label="Movie prompt", placeholder="action thriller with robots"),
        gr.Slider(1, 20, value=5, step=1, label="Top K"),
    ],
    outputs=gr.Dataframe(label="Recommendations"),
    title="🎬 Movie Nerd",
    description="Prompt-based movie recommendations using embeddings",
)

if __name__ == "__main__":
    demo.launch()
23
+
embeddings.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:80d481b4fb2494b02d67e19ab6e6da0ea5a2d1529629734ca3b48b05074904ae
3
+ size 586438272
extract_tokenizer.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pickle
2
+ import json
3
+ import sys
4
+ import string
5
+
6
class SimpleTokenizer:
    """Minimal stand-in tokenizer so pickles referencing this class can load."""

    def __init__(self, vocab=None):
        # Any falsy vocab (None or empty) normalizes to a fresh empty dict.
        self.vocab = vocab if vocab else {}
9
+
10
def is_clean_token(t):
    """Return True for printable str tokens free of NUL and U+FFFD chars."""
    if not isinstance(t, str):
        return False
    if not t.isprintable():
        return False
    return "\u0000" not in t and "\uFFFD" not in t
12
+
13
try:
    # NOTE(security): unpickling runs arbitrary code — trusted local file only.
    with open("tokenizer.pkl", "rb") as f:
        tokenizer_obj = pickle.load(f)

    # Accept either a tokenizer object exposing .vocab or a bare dict.
    vocab = getattr(tokenizer_obj, "vocab", tokenizer_obj)

    # Keep only printable, uncorrupted string keys.
    clean_vocab = {token: idx for token, idx in vocab.items() if is_clean_token(token)}

    with open("tokenizer_vocab.json", "w", encoding="utf-8") as f:
        json.dump(clean_vocab, f, indent=2, ensure_ascii=True)

    print("✓ Clean vocab extracted")
    print(f"✓ Original size: {len(vocab)}")
    print(f"✓ Clean size: {len(clean_vocab)}")

except Exception as e:
    print(f"✗ Error: {e}")
    sys.exit(1)
34
+
movies.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8cb113e2ac436edd0b2809022e4d4f087660d002f899a124191db209d5d1228a
3
+ size 1452912008
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ numpy
2
+ pandas
3
+ scikit-learn
4
+ gradio
tf_model.keras ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9d34b163498fd09e0842c58a9a0cb70825665217d25a89c8767899ee35f030bb
3
+ size 532345057
tokenizer.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dce99c3a78b1c97f4cd0e2202bf078c76c0bb7b08e2985dc2c55779839d36c75
3
+ size 17203306
tokenizer_vocab.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:de33a8fc638dbf6f9916d4046150c15a06c1aa7f6cbebd6e5a10b89becb3bc31
3
+ size 25721622