Rohitface committed on
Commit
c90e25b
·
verified ·
1 Parent(s): 0a92d7a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -24
app.py CHANGED
@@ -2,48 +2,44 @@ import gradio as gr
2
  import chromadb
3
  from sentence_transformers import SentenceTransformer
4
  from transformers import pipeline
5
- import re # Import the regular expressions library
6
 
7
- # --- 1. Load Models (No changes here) ---
8
  print("Loading sentence-transformer model for retrieval...")
9
  retriever_model = SentenceTransformer('all-MiniLM-L6-v2')
10
  print("Retriever model loaded.")
11
 
12
- print("Loading generative model for answering...")
13
- # Set device to -1 to force CPU, which is more stable on Hugging Face Spaces free tier
14
- generator_pipe = pipeline("text2text-generation", model="google/flan-t5-small", device=-1)
 
15
  print("Generative model loaded.")
 
16
 
17
 
18
  # --- 2. Setup ChromaDB ---
19
  client = chromadb.Client()
20
 
21
  try:
22
- # Using a new collection name to ensure a fresh start
23
  collection = client.create_collection("whatsapp_chat_v2")
24
  print("ChromaDB collection created.")
25
 
26
- # --- Data Loading and NEW, MORE ROBUST CLEANING ---
27
  try:
28
  print("Loading data from my_data.txt...")
29
  with open('my_data.txt', 'r', encoding='utf-8') as f:
30
  lines = [line.strip() for line in f if line.strip()]
31
 
32
- # --- NEW & IMPROVED CLEANING LOGIC ---
33
- # This regex is designed to find the start of the actual message content
34
- # It looks for a pattern like [date, time] author: or date, time - author:
35
- # and captures everything after it.
36
  message_pattern = re.compile(r'^\[?.*?\]?\s*.*?:\s*(.*)')
37
 
38
  cleaned_documents = []
39
  for line in lines:
40
  match = message_pattern.match(line)
41
- # If a match is found, the actual message is in the first group
42
  if match and match.group(1):
43
  cleaned_documents.append(match.group(1).strip())
44
 
45
  if not cleaned_documents:
46
- print("ERROR: Still could not extract any valid messages. Please check the format of 'my_data.txt'.")
47
  cleaned_documents = ["Error: The data file 'my_data.txt' could not be processed."]
48
  else:
49
  print(f"Successfully loaded and cleaned {len(cleaned_documents)} messages.")
@@ -54,7 +50,7 @@ try:
54
  print("Error: my_data.txt not found.")
55
  documents = ["Error: my_data.txt not found. Please make sure the file is uploaded."]
56
 
57
- # --- Batch Processing (No changes here) ---
58
  batch_size = 5000
59
  print("Starting to process and add documents in batches...")
60
  for i in range(0, len(documents), batch_size):
@@ -75,12 +71,12 @@ except ValueError:
75
  print("ChromaDB collection loaded.")
76
 
77
 
78
- # --- 3. Define Chatbot Logic (No changes here) ---
79
  def chatbot_response(message, history):
80
  query_embedding = retriever_model.encode([message]).tolist()
81
  results = collection.query(
82
  query_embeddings=query_embedding,
83
- n_results=5
84
  )
85
  retrieved_documents = results['documents'][0]
86
 
@@ -89,10 +85,9 @@ def chatbot_response(message, history):
89
 
90
  context = "\n- ".join(retrieved_documents)
91
  prompt = f"""
92
- Based on the following excerpts from a WhatsApp chat, please answer the user's question.
93
- Provide a concise, conversational answer. Do not just repeat the excerpts.
94
 
95
- Chat Excerpts:
96
  - {context}
97
 
98
  Question:
@@ -101,18 +96,18 @@ def chatbot_response(message, history):
101
  Answer:
102
  """
103
 
104
- generated_text = generator_pipe(prompt, max_length=100, num_beams=5, early_stopping=True)
105
  response = generated_text[0]['generated_text']
106
 
107
  return response
108
 
109
- # --- 4. Create the Gradio Interface (No changes here) ---
110
  iface = gr.ChatInterface(
111
  fn=chatbot_response,
112
- title="WhatsApp Chat Bot 💬",
113
- description="Ask me anything about this WhatsApp chat history.",
114
  theme="soft",
115
- examples=["What was discussed about the project?", "When is the next meeting?"],
116
  cache_examples=False
117
  )
118
 
 
import chromadb
from sentence_transformers import SentenceTransformer
from transformers import pipeline
import re

# --- 1. Load Models ---
print("Loading sentence-transformer model for retrieval...")
# Embedding model used to vectorise both the stored chat messages and incoming queries.
retriever_model = SentenceTransformer('all-MiniLM-L6-v2')
print("Retriever model loaded.")

print("Loading generative model for answering (google/flan-t5-base)...")
# Seq2seq 'base' checkpoint generates the final answer from retrieved context;
# device=-1 pins inference to CPU.
generator_pipe = pipeline("text2text-generation", model="google/flan-t5-base", device=-1)
print("Generative model loaded.")


# --- 2. Setup ChromaDB ---
# In-memory client: the collection exists only for the lifetime of this process.
client = chromadb.Client()
22
 
23
  try:
 
24
  collection = client.create_collection("whatsapp_chat_v2")
25
  print("ChromaDB collection created.")
26
 
27
+ # --- Data Loading and Cleaning ---
28
  try:
29
  print("Loading data from my_data.txt...")
30
  with open('my_data.txt', 'r', encoding='utf-8') as f:
31
  lines = [line.strip() for line in f if line.strip()]
32
 
 
 
 
 
33
  message_pattern = re.compile(r'^\[?.*?\]?\s*.*?:\s*(.*)')
34
 
35
  cleaned_documents = []
36
  for line in lines:
37
  match = message_pattern.match(line)
 
38
  if match and match.group(1):
39
  cleaned_documents.append(match.group(1).strip())
40
 
41
  if not cleaned_documents:
42
+ print("ERROR: Could not extract any valid messages from my_data.txt.")
43
  cleaned_documents = ["Error: The data file 'my_data.txt' could not be processed."]
44
  else:
45
  print(f"Successfully loaded and cleaned {len(cleaned_documents)} messages.")
 
50
  print("Error: my_data.txt not found.")
51
  documents = ["Error: my_data.txt not found. Please make sure the file is uploaded."]
52
 
53
+ # --- Batch Processing ---
54
  batch_size = 5000
55
  print("Starting to process and add documents in batches...")
56
  for i in range(0, len(documents), batch_size):
 
71
  print("ChromaDB collection loaded.")
72
 
73
 
74
+ # --- 3. Define Chatbot Logic ---
75
  def chatbot_response(message, history):
76
  query_embedding = retriever_model.encode([message]).tolist()
77
  results = collection.query(
78
  query_embeddings=query_embedding,
79
+ n_results=5 # Using 5 results is a good balance for the base model
80
  )
81
  retrieved_documents = results['documents'][0]
82
 
 
85
 
86
  context = "\n- ".join(retrieved_documents)
87
  prompt = f"""
88
+ Based on the following excerpts from a WhatsApp chat, provide a helpful and accurate answer to the user's question.
 
89
 
90
+ Chat Context:
91
  - {context}
92
 
93
  Question:
 
96
  Answer:
97
  """
98
 
99
+ generated_text = generator_pipe(prompt, max_length=150, num_beams=5, early_stopping=True)
100
  response = generated_text[0]['generated_text']
101
 
102
  return response
103
 
# --- 4. Create the Gradio Interface ---
# Chat UI wired to the retrieval-augmented chatbot defined above; example
# prompts are not cached because answers depend on the indexed chat data.
iface = gr.ChatInterface(
    fn=chatbot_response,
    title="WhatsApp Chat Bot ⚡️",
    description="Ask me anything about this WhatsApp chat history. (Powered by flan-t5-base)",
    theme="soft",
    examples=["What was the final decision on the project deadline?", "Summarize the conversation about the event."],
    cache_examples=False
)
113