Abhishek7356 committed on
Commit ·
6fcb6d3
1
Parent(s): dadaf42
updated code
Browse files
- .gitignore +3 -1
- app.py +47 -32
- build_index.py +27 -42
- categories_meta.pkl +3 -0
.gitignore
CHANGED
|
@@ -1 +1,3 @@
|
|
| 1 |
-
.env
|
|
|
|
|
|
|
|
|
| 1 |
+
.env
|
| 2 |
+
categories.json
|
| 3 |
+
categories.index
|
app.py
CHANGED
|
@@ -1,28 +1,27 @@
|
|
| 1 |
from flask import Flask, request, jsonify, render_template
|
| 2 |
-
from sentence_transformers import SentenceTransformer
|
| 3 |
import faiss, numpy as np, pickle, os
|
| 4 |
from openai import OpenAI
|
| 5 |
from dotenv import load_dotenv
|
| 6 |
-
from build_index import build_faiss_index # <-- import your function
|
| 7 |
|
| 8 |
load_dotenv()
|
| 9 |
app = Flask(__name__)
|
| 10 |
|
|
|
|
|
|
|
| 11 |
INDEX_PATH = "categories.index"
|
| 12 |
META_PATH = "categories_meta.pkl"
|
| 13 |
|
| 14 |
-
#
|
| 15 |
-
if not (os.path.exists(INDEX_PATH) and os.path.exists(META_PATH)):
|
| 16 |
-
print("⚠️ Index not found — building now...")
|
| 17 |
-
build_faiss_index()
|
| 18 |
-
|
| 19 |
-
# Load model, index, metadata
|
| 20 |
-
model = SentenceTransformer("all-MiniLM-L6-v2")
|
| 21 |
index = faiss.read_index(INDEX_PATH)
|
| 22 |
with open(META_PATH, "rb") as f:
|
| 23 |
categories = pickle.load(f)
|
| 24 |
|
| 25 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
|
| 27 |
def ask_openai(product_text, candidates):
|
| 28 |
prompt = f"""
|
|
@@ -31,43 +30,54 @@ You are a product category classifier.
|
|
| 31 |
Product details:
|
| 32 |
{product_text}
|
| 33 |
|
| 34 |
-
|
| 35 |
{chr(10).join([f"- {c}" for c in candidates])}
|
| 36 |
|
| 37 |
-
Choose the single
|
| 38 |
-
Return ONLY the exact category path
|
| 39 |
"""
|
| 40 |
-
|
| 41 |
model="gpt-4o-mini",
|
| 42 |
-
messages=[{"role": "user", "content": prompt}],
|
| 43 |
temperature=0.2,
|
|
|
|
| 44 |
)
|
| 45 |
-
return
|
| 46 |
-
|
| 47 |
-
@app.route("/")
|
| 48 |
-
def home():
|
| 49 |
-
return render_template("index.html")
|
| 50 |
|
| 51 |
@app.route("/predict-category", methods=["POST"])
|
| 52 |
def predict_category():
|
| 53 |
data = request.json
|
|
|
|
| 54 |
title = data.get("title", "")
|
| 55 |
description = data.get("description", "")
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
candidates = [categories[i]["category_path"] for i in I[0]]
|
| 64 |
-
|
| 65 |
-
#
|
| 66 |
sims = 1 / (1 + D[0])
|
| 67 |
confidences = (sims - sims.min()) / (sims.max() - sims.min() + 1e-9)
|
| 68 |
-
confidence = float(confidences[0])
|
| 69 |
|
| 70 |
-
#
|
| 71 |
final_category = ask_openai(text, candidates)
|
| 72 |
|
| 73 |
return jsonify({
|
|
@@ -76,8 +86,13 @@ def predict_category():
|
|
| 76 |
{"category": c, "confidence": float(conf)}
|
| 77 |
for c, conf in zip(candidates[:5], confidences[:5])
|
| 78 |
],
|
| 79 |
-
"final_confidence":
|
| 80 |
})
|
| 81 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
if __name__ == "__main__":
|
| 83 |
app.run(debug=True)
|
|
|
|
| 1 |
from flask import Flask, request, jsonify, render_template
|
|
|
|
| 2 |
import faiss, numpy as np, pickle, os
|
| 3 |
from openai import OpenAI
|
| 4 |
from dotenv import load_dotenv
|
|
|
|
| 5 |
|
| 6 |
load_dotenv()
|
| 7 |
app = Flask(__name__)
|
| 8 |
|
| 9 |
+
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
|
| 10 |
+
|
| 11 |
INDEX_PATH = "categories.index"
|
| 12 |
META_PATH = "categories_meta.pkl"
|
| 13 |
|
| 14 |
+
# Load FAISS + metadata
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
index = faiss.read_index(INDEX_PATH)
|
| 16 |
with open(META_PATH, "rb") as f:
|
| 17 |
categories = pickle.load(f)
|
| 18 |
|
| 19 |
+
def embed_query(text, model="text-embedding-3-large"):
|
| 20 |
+
resp = client.embeddings.create(
|
| 21 |
+
model=model,
|
| 22 |
+
input=text
|
| 23 |
+
)
|
| 24 |
+
return np.array(resp.data[0].embedding, dtype="float32").reshape(1, -1)
|
| 25 |
|
| 26 |
def ask_openai(product_text, candidates):
|
| 27 |
prompt = f"""
|
|
|
|
| 30 |
Product details:
|
| 31 |
{product_text}
|
| 32 |
|
| 33 |
+
Candidate categories:
|
| 34 |
{chr(10).join([f"- {c}" for c in candidates])}
|
| 35 |
|
| 36 |
+
Choose the single best category.
|
| 37 |
+
Return ONLY the exact category path.
|
| 38 |
"""
|
| 39 |
+
resp = client.chat.completions.create(
|
| 40 |
model="gpt-4o-mini",
|
|
|
|
| 41 |
temperature=0.2,
|
| 42 |
+
messages=[{"role": "user", "content": prompt}]
|
| 43 |
)
|
| 44 |
+
return resp.choices[0].message.content.strip()
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
|
| 46 |
@app.route("/predict-category", methods=["POST"])
|
| 47 |
def predict_category():
|
| 48 |
data = request.json
|
| 49 |
+
|
| 50 |
title = data.get("title", "")
|
| 51 |
description = data.get("description", "")
|
| 52 |
+
product_type = data.get("product_type", "")
|
| 53 |
+
tags = data.get("tags", [])
|
| 54 |
+
|
| 55 |
+
# Handle tags if passed as a list OR comma-separated string
|
| 56 |
+
if isinstance(tags, list):
|
| 57 |
+
tags_text = ", ".join(tags)
|
| 58 |
+
else:
|
| 59 |
+
tags_text = str(tags)
|
| 60 |
+
|
| 61 |
+
# Combine everything into a single rich text block
|
| 62 |
+
text = f"""
|
| 63 |
+
Title: {title}
|
| 64 |
+
Description: {description}
|
| 65 |
+
Product Type: {product_type}
|
| 66 |
+
Tags: {tags_text}
|
| 67 |
+
""".strip()
|
| 68 |
+
|
| 69 |
+
# Embed using OpenAI
|
| 70 |
+
query_vec = embed_query(text)
|
| 71 |
+
|
| 72 |
+
# Search FAISS
|
| 73 |
+
D, I = index.search(query_vec, 10)
|
| 74 |
candidates = [categories[i]["category_path"] for i in I[0]]
|
| 75 |
+
print(candidates)
|
| 76 |
+
# Confidence scoring
|
| 77 |
sims = 1 / (1 + D[0])
|
| 78 |
confidences = (sims - sims.min()) / (sims.max() - sims.min() + 1e-9)
|
|
|
|
| 79 |
|
| 80 |
+
# GPT refinement
|
| 81 |
final_category = ask_openai(text, candidates)
|
| 82 |
|
| 83 |
return jsonify({
|
|
|
|
| 86 |
{"category": c, "confidence": float(conf)}
|
| 87 |
for c, conf in zip(candidates[:5], confidences[:5])
|
| 88 |
],
|
| 89 |
+
"final_confidence": float(confidences[0])
|
| 90 |
})
|
| 91 |
|
| 92 |
+
@app.route("/")
|
| 93 |
+
def home():
|
| 94 |
+
return render_template("index.html")
|
| 95 |
+
|
| 96 |
+
|
| 97 |
if __name__ == "__main__":
|
| 98 |
app.run(debug=True)
|
build_index.py
CHANGED
|
@@ -1,76 +1,61 @@
|
|
| 1 |
import json
|
| 2 |
import numpy as np
|
| 3 |
import faiss
|
| 4 |
-
from sentence_transformers import SentenceTransformer
|
| 5 |
import pickle
|
| 6 |
-
from tqdm import tqdm
|
| 7 |
import os
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
def build_faiss_index(
|
| 10 |
json_path="categories.json",
|
| 11 |
index_path="categories.index",
|
| 12 |
meta_path="categories_meta.pkl",
|
| 13 |
-
model_name="all-MiniLM-L6-v2",
|
| 14 |
-
batch_size=256
|
| 15 |
):
|
| 16 |
-
""
|
| 17 |
-
Builds a FAISS index from a category JSON file.
|
| 18 |
-
|
| 19 |
-
Args:
|
| 20 |
-
json_path (str): Path to the JSON file containing category data.
|
| 21 |
-
index_path (str): Path to save the FAISS index file.
|
| 22 |
-
meta_path (str): Path to save the metadata pickle file.
|
| 23 |
-
model_name (str): SentenceTransformer model name.
|
| 24 |
-
batch_size (int): Batch size for encoding.
|
| 25 |
-
|
| 26 |
-
Returns:
|
| 27 |
-
tuple: (index_path, meta_path)
|
| 28 |
-
"""
|
| 29 |
-
print("🔧 Building FAISS index...")
|
| 30 |
-
|
| 31 |
-
# Load categories
|
| 32 |
if not os.path.exists(json_path):
|
| 33 |
-
raise FileNotFoundError(f"β JSON
|
|
|
|
| 34 |
with open(json_path, "r", encoding="utf-8") as f:
|
| 35 |
categories = json.load(f)
|
| 36 |
|
| 37 |
texts = [c["category_path"] for c in categories]
|
| 38 |
print(f"📦 Total categories: {len(texts)}")
|
| 39 |
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
embeddings = []
|
| 43 |
-
|
| 44 |
-
total_batches = (len(texts) + batch_size - 1) // batch_size
|
| 45 |
-
print(f"⚙️ Encoding in {total_batches} batches...")
|
| 46 |
-
|
| 47 |
-
for i in range(0, len(texts), batch_size):
|
| 48 |
-
current_batch = i // batch_size + 1
|
| 49 |
-
print(f"π Batch {current_batch}/{total_batches}")
|
| 50 |
-
batch = texts[i:i + batch_size]
|
| 51 |
-
batch_embeddings = model.encode(batch, convert_to_numpy=True, show_progress_bar=False)
|
| 52 |
-
embeddings.append(batch_embeddings)
|
| 53 |
-
|
| 54 |
-
embeddings = np.vstack(embeddings)
|
| 55 |
|
| 56 |
# Build FAISS index
|
| 57 |
dim = embeddings.shape[1]
|
| 58 |
index = faiss.IndexFlatL2(dim)
|
| 59 |
index.add(embeddings)
|
| 60 |
|
| 61 |
-
# Save index + metadata
|
| 62 |
faiss.write_index(index, index_path)
|
|
|
|
| 63 |
with open(meta_path, "wb") as f:
|
| 64 |
pickle.dump(categories, f)
|
| 65 |
|
| 66 |
-
print(
|
| 67 |
-
print(f"π Index
|
| 68 |
-
print(f"π Metadata
|
| 69 |
-
print(f"π
|
| 70 |
|
| 71 |
return index_path, meta_path
|
| 72 |
|
| 73 |
|
| 74 |
-
# Optional: run directly to build index
|
| 75 |
if __name__ == "__main__":
|
| 76 |
build_faiss_index()
|
|
|
|
| 1 |
import json
|
| 2 |
import numpy as np
|
| 3 |
import faiss
|
|
|
|
| 4 |
import pickle
|
|
|
|
| 5 |
import os
|
| 6 |
+
from tqdm import tqdm
|
| 7 |
+
from openai import OpenAI
|
| 8 |
+
from dotenv import load_dotenv
|
| 9 |
+
|
| 10 |
+
load_dotenv()
|
| 11 |
+
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
|
| 12 |
+
|
| 13 |
+
def embed_text(texts, model="text-embedding-3-large"):
|
| 14 |
+
embeddings = []
|
| 15 |
+
for t in tqdm(texts, desc="Creating OpenAI embeddings"):
|
| 16 |
+
resp = client.embeddings.create(
|
| 17 |
+
model=model,
|
| 18 |
+
input=t
|
| 19 |
+
)
|
| 20 |
+
embeddings.append(resp.data[0].embedding)
|
| 21 |
+
return np.array(embeddings).astype("float32")
|
| 22 |
+
|
| 23 |
|
| 24 |
def build_faiss_index(
|
| 25 |
json_path="categories.json",
|
| 26 |
index_path="categories.index",
|
| 27 |
meta_path="categories_meta.pkl",
|
|
|
|
|
|
|
| 28 |
):
|
| 29 |
+
print("🔧 Loading categories...")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
if not os.path.exists(json_path):
|
| 31 |
+
raise FileNotFoundError(f"❌ JSON missing: {json_path}")
|
| 32 |
+
|
| 33 |
with open(json_path, "r", encoding="utf-8") as f:
|
| 34 |
categories = json.load(f)
|
| 35 |
|
| 36 |
texts = [c["category_path"] for c in categories]
|
| 37 |
print(f"📦 Total categories: {len(texts)}")
|
| 38 |
|
| 39 |
+
print("⚙️ Creating OpenAI embeddings…")
|
| 40 |
+
embeddings = embed_text(texts)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
|
| 42 |
# Build FAISS index
|
| 43 |
dim = embeddings.shape[1]
|
| 44 |
index = faiss.IndexFlatL2(dim)
|
| 45 |
index.add(embeddings)
|
| 46 |
|
|
|
|
| 47 |
faiss.write_index(index, index_path)
|
| 48 |
+
|
| 49 |
with open(meta_path, "wb") as f:
|
| 50 |
pickle.dump(categories, f)
|
| 51 |
|
| 52 |
+
print("\n✅ OpenAI + FAISS index created!")
|
| 53 |
+
print(f"π Index: {index_path}")
|
| 54 |
+
print(f"π Metadata: {meta_path}")
|
| 55 |
+
print(f"π Embedding dim: {dim}")
|
| 56 |
|
| 57 |
return index_path, meta_path
|
| 58 |
|
| 59 |
|
|
|
|
| 60 |
if __name__ == "__main__":
|
| 61 |
build_faiss_index()
|
categories_meta.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:058adfdee542ef1ef62b6da38726bf878321ad5e5e13bad7483da65aa93ef33d
|
| 3 |
+
size 3111090
|