Yordann committed
Commit 00b9c1a · verified · 1 Parent(s): 6900841

Update app.py

Files changed (1)
  1. app.py +68 -0
app.py CHANGED
@@ -11,11 +11,73 @@ from bs4 import BeautifulSoup
 import cv2
 from io import BytesIO
 import torch
+from sentence_transformers import SentenceTransformer
+import numpy as np
+import faiss
+
 
 login(token=os.getenv("chatbot"))
 generator = pipeline("text-generation", model="mistralai/Mistral-7B-Instruct-v0.1")
 bg_to_en = pipeline("translation", model="Helsinki-NLP/opus-mt-bg-en")
 en_to_bg = pipeline("translation", model="Helsinki-NLP/opus-mt-en-bg")
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+def load_chunks(path, chunk_size=300):
+    with open(path, "r", encoding="utf-8") as f:
+        text = f.read()
+
+    sentences = text.split(". ")
+    chunks, chunk = [], ""
+
+    for sentence in sentences:
+        if len(chunk.split()) + len(sentence.split()) < chunk_size:
+            chunk += sentence + ". "
+        else:
+            chunks.append(chunk.strip())
+            chunk = sentence + ". "
+
+    if chunk:
+        chunks.append(chunk.strip())
+
+    return chunks
+
+# Load your document chunks
+chunks = load_chunks("MasterBrand Explanation.txt")
+
+# Create embeddings
+embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
+embeddings = embedding_model.encode(chunks)
+
+# Build FAISS index
+dimension = embeddings[0].shape[0]
+index = faiss.IndexFlatL2(dimension)
+index.add(np.array(embeddings))
+
+def search_similar_chunks(query, k=3):
+    query_embedding = embedding_model.encode([query])
+    distances, indices = index.search(np.array(query_embedding), k)
+    return [chunks[i] for i in indices[0]]
+
+def generate_answer_with_context(question):
+    top_chunks = search_similar_chunks(question)
+    context = "\n\n".join(top_chunks)
+
+    prompt = f"""<s>
+You are a helpful assistant trained on e-commerce and branding content.
+
+Use the context below to answer the question.
+
+Context:
+{context}
+
+Question: {question}
+Answer:"""
+
+    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+    outputs = model.generate(**inputs, max_new_tokens=300)
+    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+    return response.replace(prompt, "").strip()
 
 # Load BLIP for image captioning
 blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
@@ -60,6 +122,12 @@ def generate_response(user_input, top_p, temperature, chat_counter, chatbot, his
 
     prompt = ""
 
+    top_chunks = search_similar_chunks(user_input_translated)
+    rag_context = "\n\n".join(top_chunks)
+
+    prompt += f"[Context from your e-commerce training document]:\n{rag_context}\n\n"
+
+
     # Multimodal additions
     if image is not None:
         try:
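
Note on the new RAG code: it calls AutoTokenizer.from_pretrained(model_id) and model.generate(...), but neither model_id nor model (nor an AutoTokenizer import) appears in the hunks above, so they must already be defined elsewhere in app.py for this commit to run. Below is a minimal sketch of the assumed setup, using the same mistralai/Mistral-7B-Instruct-v0.1 checkpoint as the existing text-generation pipeline; the checkpoint choice, dtype, and device placement are assumptions, not part of the commit.

# Sketch only: definitions the new functions appear to rely on.
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

model_id = "mistralai/Mistral-7B-Instruct-v0.1"  # assumed to match the pipeline above
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,  # assumption: half precision to fit a 7B model on one GPU
    device_map="auto",
)

# With these in place, the retrieval-augmented helper added in this commit can be called:
#   answer = generate_answer_with_context("What does the MasterBrand document cover?")

Because the index is an IndexFlatL2 built from all-MiniLM-L6-v2 embeddings, queries must be encoded with that same embedding model for the distances to be meaningful, which is what the new search_similar_chunks function does before the retrieved chunks are spliced into the prompt in generate_response.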