Abhishek7356 committed on
Commit
6fcb6d3
·
1 Parent(s): dadaf42

updated code

Browse files
Files changed (4) hide show
  1. .gitignore +3 -1
  2. app.py +47 -32
  3. build_index.py +27 -42
  4. categories_meta.pkl +3 -0
.gitignore CHANGED
@@ -1 +1,3 @@
1
- .env
 
 
 
1
+ .env
2
+ categories.json
3
+ categories.index
app.py CHANGED
@@ -1,28 +1,27 @@
1
  from flask import Flask, request, jsonify, render_template
2
- from sentence_transformers import SentenceTransformer
3
  import faiss, numpy as np, pickle, os
4
  from openai import OpenAI
5
  from dotenv import load_dotenv
6
- from build_index import build_faiss_index # <-- import your function
7
 
8
  load_dotenv()
9
  app = Flask(__name__)
10
 
 
 
11
  INDEX_PATH = "categories.index"
12
  META_PATH = "categories_meta.pkl"
13
 
14
- # Automatically build the index if missing
15
- if not (os.path.exists(INDEX_PATH) and os.path.exists(META_PATH)):
16
- print("⚠️ Index not found β€” building now...")
17
- build_faiss_index()
18
-
19
- # Load model, index, metadata
20
- model = SentenceTransformer("all-MiniLM-L6-v2")
21
  index = faiss.read_index(INDEX_PATH)
22
  with open(META_PATH, "rb") as f:
23
  categories = pickle.load(f)
24
 
25
- client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
 
 
 
 
 
26
 
27
  def ask_openai(product_text, candidates):
28
  prompt = f"""
@@ -31,43 +30,54 @@ You are a product category classifier.
31
  Product details:
32
  {product_text}
33
 
34
- Here are the most relevant candidate categories:
35
  {chr(10).join([f"- {c}" for c in candidates])}
36
 
37
- Choose the single most accurate category path from the list above.
38
- Return ONLY the exact category path text.
39
  """
40
- response = client.chat.completions.create(
41
  model="gpt-4o-mini",
42
- messages=[{"role": "user", "content": prompt}],
43
  temperature=0.2,
 
44
  )
45
- return response.choices[0].message.content.strip()
46
-
47
- @app.route("/")
48
- def home():
49
- return render_template("index.html")
50
 
51
  @app.route("/predict-category", methods=["POST"])
52
  def predict_category():
53
  data = request.json
 
54
  title = data.get("title", "")
55
  description = data.get("description", "")
56
- text = f"{title}. {description}"
57
-
58
- # Embed query
59
- query_vec = model.encode([text], convert_to_numpy=True)
60
-
61
- # Search FAISS index
62
- D, I = index.search(np.array(query_vec, dtype=np.float32), 10)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  candidates = [categories[i]["category_path"] for i in I[0]]
64
-
65
- # Compute confidence
66
  sims = 1 / (1 + D[0])
67
  confidences = (sims - sims.min()) / (sims.max() - sims.min() + 1e-9)
68
- confidence = float(confidences[0])
69
 
70
- # Get GPT refinement
71
  final_category = ask_openai(text, candidates)
72
 
73
  return jsonify({
@@ -76,8 +86,13 @@ def predict_category():
76
  {"category": c, "confidence": float(conf)}
77
  for c, conf in zip(candidates[:5], confidences[:5])
78
  ],
79
- "final_confidence": round(confidence, 3)
80
  })
81
 
 
 
 
 
 
82
  if __name__ == "__main__":
83
  app.run(debug=True)
 
1
  from flask import Flask, request, jsonify, render_template
 
2
  import faiss, numpy as np, pickle, os
3
  from openai import OpenAI
4
  from dotenv import load_dotenv
 
5
 
6
  load_dotenv()
7
  app = Flask(__name__)
8
 
9
+ client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
10
+
11
  INDEX_PATH = "categories.index"
12
  META_PATH = "categories_meta.pkl"
13
 
14
+ # Load FAISS + metadata
 
 
 
 
 
 
15
  index = faiss.read_index(INDEX_PATH)
16
  with open(META_PATH, "rb") as f:
17
  categories = pickle.load(f)
18
 
19
def embed_query(text, model="text-embedding-3-large"):
    """Embed *text* with the OpenAI embeddings API.

    Returns a (1, dim) float32 numpy array, shaped for a FAISS search.
    """
    response = client.embeddings.create(model=model, input=text)
    vector = response.data[0].embedding
    return np.array(vector, dtype="float32").reshape(1, -1)
25
 
26
def ask_openai(product_text, candidates):
    """Ask GPT to pick the best category path for *product_text*.

    Args:
        product_text: Free-text product description block.
        candidates: Candidate category-path strings from the FAISS search.

    Returns:
        The chosen category path. The raw model answer is snapped to an
        exact candidate (case-insensitively) when possible, since the
        model occasionally alters casing or adds stray punctuation;
        otherwise the stripped answer is returned as-is.
    """
    # NOTE(review): prompt wording partially reconstructed from the diff
    # context (the opening line was folded into a hunk header) — confirm.
    prompt = f"""
You are a product category classifier.

Product details:
{product_text}

Candidate categories:
{chr(10).join([f"- {c}" for c in candidates])}

Choose the single best category.
Return ONLY the exact category path.
"""
    resp = client.chat.completions.create(
        model="gpt-4o-mini",
        temperature=0.2,
        messages=[{"role": "user", "content": prompt}]
    )
    answer = resp.choices[0].message.content.strip()
    # Normalize the answer against the candidate list so callers get an
    # exact category_path string whenever the model's pick is recognizable.
    if answer not in candidates:
        by_lower = {c.lower(): c for c in candidates}
        answer = by_lower.get(answer.lower(), answer)
    return answer
 
 
 
 
45
 
46
@app.route("/predict-category", methods=["POST"])
def predict_category():
    """Predict the best product category for a JSON-posted product.

    Expects a JSON body with optional keys: ``title``, ``description``,
    ``product_type``, and ``tags`` (a list or a comma-separated string).
    Returns the GPT-refined category plus the top FAISS candidates with
    min-max-normalized confidence scores.
    """
    # silent=True yields None (not a 400/415 error) on a missing or
    # non-JSON body; fall back to an empty dict so .get() defaults apply.
    data = request.get_json(silent=True) or {}

    title = data.get("title", "")
    description = data.get("description", "")
    product_type = data.get("product_type", "")
    tags = data.get("tags", [])

    # Tags may arrive as a list OR a comma-separated string.
    tags_text = ", ".join(tags) if isinstance(tags, list) else str(tags)

    # Combine everything into a single rich text block for embedding.
    text = f"""
Title: {title}
Description: {description}
Product Type: {product_type}
Tags: {tags_text}
""".strip()

    # Embed with OpenAI, then pull the 10 nearest categories from FAISS.
    query_vec = embed_query(text)
    D, I = index.search(query_vec, 10)
    candidates = [categories[i]["category_path"] for i in I[0]]

    # Convert L2 distances to similarities, then min-max normalize.
    # NOTE(review): after min-max normalization the top hit is always
    # ~1.0, so final_confidence carries little information — consider
    # reporting the raw similarity instead.
    sims = 1 / (1 + D[0])
    confidences = (sims - sims.min()) / (sims.max() - sims.min() + 1e-9)

    # GPT refinement over the candidate list.
    final_category = ask_openai(text, candidates)

    # NOTE(review): response field names partially reconstructed from the
    # diff (keys hidden in a skipped hunk) — confirm against the client.
    return jsonify({
        "final_category": final_category,
        "top_candidates": [
            {"category": c, "confidence": float(conf)}
            for c, conf in zip(candidates[:5], confidences[:5])
        ],
        "final_confidence": float(confidences[0])
    })
91
 
92
@app.route("/")
def home():
    """Serve the single-page UI shell."""
    return render_template("index.html")
95
+
96
+
97
  if __name__ == "__main__":
98
  app.run(debug=True)
build_index.py CHANGED
@@ -1,76 +1,61 @@
1
  import json
2
  import numpy as np
3
  import faiss
4
- from sentence_transformers import SentenceTransformer
5
  import pickle
6
- from tqdm import tqdm
7
  import os
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
  def build_faiss_index(
10
  json_path="categories.json",
11
  index_path="categories.index",
12
  meta_path="categories_meta.pkl",
13
- model_name="all-MiniLM-L6-v2",
14
- batch_size=256
15
  ):
16
- """
17
- Builds a FAISS index from a category JSON file.
18
-
19
- Args:
20
- json_path (str): Path to the JSON file containing category data.
21
- index_path (str): Path to save the FAISS index file.
22
- meta_path (str): Path to save the metadata pickle file.
23
- model_name (str): SentenceTransformer model name.
24
- batch_size (int): Batch size for encoding.
25
-
26
- Returns:
27
- tuple: (index_path, meta_path)
28
- """
29
- print("πŸ”§ Building FAISS index...")
30
-
31
- # Load categories
32
  if not os.path.exists(json_path):
33
- raise FileNotFoundError(f"❌ JSON file not found: {json_path}")
 
34
  with open(json_path, "r", encoding="utf-8") as f:
35
  categories = json.load(f)
36
 
37
  texts = [c["category_path"] for c in categories]
38
  print(f"πŸ“¦ Total categories: {len(texts)}")
39
 
40
- # Load model
41
- model = SentenceTransformer(model_name)
42
- embeddings = []
43
-
44
- total_batches = (len(texts) + batch_size - 1) // batch_size
45
- print(f"βš™οΈ Encoding in {total_batches} batches...")
46
-
47
- for i in range(0, len(texts), batch_size):
48
- current_batch = i // batch_size + 1
49
- print(f"πŸš€ Batch {current_batch}/{total_batches}")
50
- batch = texts[i:i + batch_size]
51
- batch_embeddings = model.encode(batch, convert_to_numpy=True, show_progress_bar=False)
52
- embeddings.append(batch_embeddings)
53
-
54
- embeddings = np.vstack(embeddings)
55
 
56
  # Build FAISS index
57
  dim = embeddings.shape[1]
58
  index = faiss.IndexFlatL2(dim)
59
  index.add(embeddings)
60
 
61
- # Save index + metadata
62
  faiss.write_index(index, index_path)
 
63
  with open(meta_path, "wb") as f:
64
  pickle.dump(categories, f)
65
 
66
- print(f"\nβœ… FAISS index built successfully!")
67
- print(f"πŸ“ Index saved at: {index_path}")
68
- print(f"πŸ“ Metadata saved at: {meta_path}")
69
- print(f"πŸ“Š Total categories: {len(categories)} | Embedding dimension: {dim}")
70
 
71
  return index_path, meta_path
72
 
73
 
74
- # Optional: run directly to build index
75
  if __name__ == "__main__":
76
  build_faiss_index()
 
1
  import json
2
  import numpy as np
3
  import faiss
 
4
  import pickle
 
5
  import os
6
+ from tqdm import tqdm
7
+ from openai import OpenAI
8
+ from dotenv import load_dotenv
9
+
10
+ load_dotenv()
11
+ client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
12
+
13
def embed_text(texts, model="text-embedding-3-large", batch_size=100):
    """Embed *texts* with the OpenAI embeddings API.

    Sends inputs in batches — the embeddings endpoint accepts a list per
    request — instead of one HTTP round-trip per text, which is far
    faster and cheaper in request overhead for large category lists.
    The output is identical to per-text calls.

    Args:
        texts: Sequence of strings to embed.
        model: OpenAI embedding model name.
        batch_size: Texts per API request (new, backward-compatible).

    Returns:
        An (n, dim) float32 numpy array, row order matching *texts*.
    """
    embeddings = []
    for start in tqdm(range(0, len(texts), batch_size),
                      desc="Creating OpenAI embeddings"):
        batch = texts[start:start + batch_size]
        resp = client.embeddings.create(model=model, input=batch)
        # Sort by .index to guarantee input order is preserved.
        for item in sorted(resp.data, key=lambda d: d.index):
            embeddings.append(item.embedding)
    return np.array(embeddings).astype("float32")
22
+
23
 
24
def build_faiss_index(
    json_path="categories.json",
    index_path="categories.index",
    meta_path="categories_meta.pkl",
):
    """Build a flat L2 FAISS index over the category paths in *json_path*.

    Embeds every ``category_path`` with OpenAI, writes the index to
    *index_path* and pickles the raw category records to *meta_path*.

    Returns:
        tuple: (index_path, meta_path)

    Raises:
        FileNotFoundError: if *json_path* does not exist.
    """
    print("🔧 Loading categories...")
    if not os.path.exists(json_path):
        raise FileNotFoundError(f"❌ JSON missing: {json_path}")

    with open(json_path, "r", encoding="utf-8") as f:
        records = json.load(f)

    category_paths = [rec["category_path"] for rec in records]
    print(f"📦 Total categories: {len(category_paths)}")

    print("⚙️ Creating OpenAI embeddings…")
    vectors = embed_text(category_paths)

    # Exact (non-approximate) L2 index — fine at category-catalog scale.
    embedding_dim = vectors.shape[1]
    faiss_index = faiss.IndexFlatL2(embedding_dim)
    faiss_index.add(vectors)

    faiss.write_index(faiss_index, index_path)

    with open(meta_path, "wb") as f:
        pickle.dump(records, f)

    print("\n✅ OpenAI + FAISS index created!")
    print(f"📁 Index: {index_path}")
    print(f"📁 Metadata: {meta_path}")
    print(f"📊 Embedding dim: {embedding_dim}")

    return index_path, meta_path


if __name__ == "__main__":
    build_faiss_index()
categories_meta.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:058adfdee542ef1ef62b6da38726bf878321ad5e5e13bad7483da65aa93ef33d
3
+ size 3111090