Spaces:

Rohitface
/

Charbot

Sleeping

App Files Community

Rohitface committed on Aug 18, 2025

Commit

c90e25b

verified ·

1 Parent(s): 0a92d7a

Update app.py

Browse files

Files changed (1) hide show

app.py +19 -24

app.py CHANGED Viewed

@@ -2,48 +2,44 @@ import gradio as gr
 import chromadb
 from sentence_transformers import SentenceTransformer
 from transformers import pipeline
-import re # Import the regular expressions library
-# --- 1. Load Models (No changes here) ---
 print("Loading sentence-transformer model for retrieval...")
 retriever_model = SentenceTransformer('all-MiniLM-L6-v2')
 print("Retriever model loaded.")
-print("Loading generative model for answering...")
-# Set device to -1 to force CPU, which is more stable on Hugging Face Spaces free tier
-generator_pipe = pipeline("text2text-generation", model="google/flan-t5-small", device=-1)
 print("Generative model loaded.")
 # --- 2. Setup ChromaDB ---
 client = chromadb.Client()
 try:
-    # Using a new collection name to ensure a fresh start
     collection = client.create_collection("whatsapp_chat_v2")
     print("ChromaDB collection created.")
-    # --- Data Loading and NEW, MORE ROBUST CLEANING ---
     try:
         print("Loading data from my_data.txt...")
         with open('my_data.txt', 'r', encoding='utf-8') as f:
             lines = [line.strip() for line in f if line.strip()]
-        # --- NEW & IMPROVED CLEANING LOGIC ---
-        # This regex is designed to find the start of the actual message content
-        # It looks for a pattern like [date, time] author: or date, time - author:
-        # and captures everything after it.
         message_pattern = re.compile(r'^\[?.*?\]?\s*.*?:\s*(.*)')
         cleaned_documents = []
         for line in lines:
             match = message_pattern.match(line)
-            # If a match is found, the actual message is in the first group
             if match and match.group(1):
                 cleaned_documents.append(match.group(1).strip())
         if not cleaned_documents:
-            print("ERROR: Still could not extract any valid messages. Please check the format of 'my_data.txt'.")
             cleaned_documents = ["Error: The data file 'my_data.txt' could not be processed."]
         else:
             print(f"Successfully loaded and cleaned {len(cleaned_documents)} messages.")
@@ -54,7 +50,7 @@ try:
         print("Error: my_data.txt not found.")
         documents = ["Error: my_data.txt not found. Please make sure the file is uploaded."]
-    # --- Batch Processing (No changes here) ---
     batch_size = 5000
     print("Starting to process and add documents in batches...")
     for i in range(0, len(documents), batch_size):
@@ -75,12 +71,12 @@ except ValueError:
     print("ChromaDB collection loaded.")
-# --- 3. Define Chatbot Logic (No changes here) ---
 def chatbot_response(message, history):
     query_embedding = retriever_model.encode([message]).tolist()
     results = collection.query(
         query_embeddings=query_embedding,
-        n_results=5
     )
     retrieved_documents = results['documents'][0]
@@ -89,10 +85,9 @@ def chatbot_response(message, history):
     context = "\n- ".join(retrieved_documents)
     prompt = f"""
-    Based on the following excerpts from a WhatsApp chat, please answer the user's question.
-    Provide a concise, conversational answer. Do not just repeat the excerpts.
-    Chat Excerpts:
     - {context}
     Question:
@@ -101,18 +96,18 @@ def chatbot_response(message, history):
     Answer:
     """
-    generated_text = generator_pipe(prompt, max_length=100, num_beams=5, early_stopping=True)
     response = generated_text[0]['generated_text']
     return response
-# --- 4. Create the Gradio Interface (No changes here) ---
 iface = gr.ChatInterface(
     fn=chatbot_response,
-    title="WhatsApp Chat Bot 💬",
-    description="Ask me anything about this WhatsApp chat history.",
     theme="soft",
-    examples=["What was discussed about the project?", "When is the next meeting?"],
     cache_examples=False
 )

 import chromadb
 from sentence_transformers import SentenceTransformer
 from transformers import pipeline
+import re
+# --- 1. Load Models ---
 print("Loading sentence-transformer model for retrieval...")
 retriever_model = SentenceTransformer('all-MiniLM-L6-v2')
 print("Retriever model loaded.")
+# --- THIS IS THE UPDATED LINE ---
+print("Loading generative model for answering (google/flan-t5-base)...")
+# Using the balanced 'base' model for better performance and reliability.
+generator_pipe = pipeline("text2text-generation", model="google/flan-t5-base", device=-1)
 print("Generative model loaded.")
+# --- END OF UPDATE ---
 # --- 2. Setup ChromaDB ---
 client = chromadb.Client()
 try:
     collection = client.create_collection("whatsapp_chat_v2")
     print("ChromaDB collection created.")
+    # --- Data Loading and Cleaning ---
     try:
         print("Loading data from my_data.txt...")
         with open('my_data.txt', 'r', encoding='utf-8') as f:
             lines = [line.strip() for line in f if line.strip()]
         message_pattern = re.compile(r'^\[?.*?\]?\s*.*?:\s*(.*)')
         cleaned_documents = []
         for line in lines:
             match = message_pattern.match(line)
             if match and match.group(1):
                 cleaned_documents.append(match.group(1).strip())
         if not cleaned_documents:
+            print("ERROR: Could not extract any valid messages from my_data.txt.")
             cleaned_documents = ["Error: The data file 'my_data.txt' could not be processed."]
         else:
             print(f"Successfully loaded and cleaned {len(cleaned_documents)} messages.")
         print("Error: my_data.txt not found.")
         documents = ["Error: my_data.txt not found. Please make sure the file is uploaded."]
+    # --- Batch Processing ---
     batch_size = 5000
     print("Starting to process and add documents in batches...")
     for i in range(0, len(documents), batch_size):
     print("ChromaDB collection loaded.")
+# --- 3. Define Chatbot Logic ---
 def chatbot_response(message, history):
     query_embedding = retriever_model.encode([message]).tolist()
     results = collection.query(
         query_embeddings=query_embedding,
+        n_results=5 # Using 5 results is a good balance for the base model
     )
     retrieved_documents = results['documents'][0]
     context = "\n- ".join(retrieved_documents)
     prompt = f"""
+    Based on the following excerpts from a WhatsApp chat, provide a helpful and accurate answer to the user's question.
+    Chat Context:
     - {context}
     Question:
     Answer:
     """
+    generated_text = generator_pipe(prompt, max_length=150, num_beams=5, early_stopping=True)
     response = generated_text[0]['generated_text']
     return response
+# --- 4. Create the Gradio Interface ---
 iface = gr.ChatInterface(
     fn=chatbot_response,
+    title="WhatsApp Chat Bot ⚡️",
+    description="Ask me anything about this WhatsApp chat history. (Powered by flan-t5-base)",
     theme="soft",
+    examples=["What was the final decision on the project deadline?", "Summarize the conversation about the event."],
     cache_examples=False
 )