Spaces:

Rohitface
/

Charbot

Sleeping

App Files Community

Rohitface committed on Aug 18, 2025

Commit

0a92d7a

verified ·

1 Parent(s): 24ee0b7

Update app.py

Browse files

Files changed (1) hide show

app.py +21 -27

app.py CHANGED Viewed

@@ -4,15 +4,14 @@ from sentence_transformers import SentenceTransformer
 from transformers import pipeline
 import re # Import the regular expressions library
-# --- 1. Load Models ---
 print("Loading sentence-transformer model for retrieval...")
-# This model is for finding relevant chat lines
 retriever_model = SentenceTransformer('all-MiniLM-L6-v2')
 print("Retriever model loaded.")
 print("Loading generative model for answering...")
-# This model will generate the actual answers
-generator_pipe = pipeline("text2text-generation", model="google/flan-t5-small")
 print("Generative model loaded.")
@@ -20,34 +19,32 @@ print("Generative model loaded.")
 client = chromadb.Client()
 try:
-    collection = client.create_collection("whatsapp_chat")
     print("ChromaDB collection created.")
-    # --- Data Loading and CLEANING ---
     try:
         print("Loading data from my_data.txt...")
         with open('my_data.txt', 'r', encoding='utf-8') as f:
             lines = [line.strip() for line in f if line.strip()]
-        # --- NEW: Clean the chat data ---
-        # This pattern removes the date, time, and author (e.g., "M/D/YY, HH:MM - Author:")
-        # It keeps only the actual message content.
         cleaned_documents = []
         for line in lines:
-            # Find the position of the first ':'
-            first_colon_pos = line.find(':')
-            if first_colon_pos != -1:
-                # Find the position of ' - ' before the colon
-                separator_pos = line.rfind(' - ', 0, first_colon_pos)
-                if separator_pos != -1:
-                    # Extract the message part
-                    message = line[first_colon_pos + 1:].strip()
-                    if message: # Ensure the message is not empty
-                        cleaned_documents.append(message)
         if not cleaned_documents:
-            print("Warning: Could not extract any valid messages from my_data.txt.")
-            cleaned_documents = ["Error: The data file 'my_data.txt' appears to have no valid messages."]
         else:
             print(f"Successfully loaded and cleaned {len(cleaned_documents)} messages.")
@@ -74,24 +71,22 @@ try:
     print("All documents have been successfully added to ChromaDB.")
 except ValueError:
-    collection = client.get_collection("whatsapp_chat")
     print("ChromaDB collection loaded.")
-# --- 3. Define the NEW Chatbot Logic ---
 def chatbot_response(message, history):
-    # 1. Retrieve relevant documents from ChromaDB
     query_embedding = retriever_model.encode([message]).tolist()
     results = collection.query(
         query_embeddings=query_embedding,
-        n_results=5 # Retrieve more context, e.g., 5 lines
     )
     retrieved_documents = results['documents'][0]
     if not retrieved_documents or "Error:" in retrieved_documents[0]:
         return "I'm sorry, I couldn't find any relevant information in the chat history. 🤔"
-    # 2. Augment the prompt for the generative model
     context = "\n- ".join(retrieved_documents)
     prompt = f"""
     Based on the following excerpts from a WhatsApp chat, please answer the user's question.
@@ -106,7 +101,6 @@ def chatbot_response(message, history):
     Answer:
     """
-    # 3. Generate the final response
     generated_text = generator_pipe(prompt, max_length=100, num_beams=5, early_stopping=True)
     response = generated_text[0]['generated_text']

 from transformers import pipeline
 import re # Import the regular expressions library
+# --- 1. Load Models (No changes here) ---
 print("Loading sentence-transformer model for retrieval...")
 retriever_model = SentenceTransformer('all-MiniLM-L6-v2')
 print("Retriever model loaded.")
 print("Loading generative model for answering...")
+# Set device to -1 to force CPU, which is more stable on Hugging Face Spaces free tier
+generator_pipe = pipeline("text2text-generation", model="google/flan-t5-small", device=-1)
 print("Generative model loaded.")
 client = chromadb.Client()
 try:
+    # Using a new collection name to ensure a fresh start
+    collection = client.create_collection("whatsapp_chat_v2")
     print("ChromaDB collection created.")
+    # --- Data Loading and NEW, MORE ROBUST CLEANING ---
     try:
         print("Loading data from my_data.txt...")
         with open('my_data.txt', 'r', encoding='utf-8') as f:
             lines = [line.strip() for line in f if line.strip()]
+        # --- NEW & IMPROVED CLEANING LOGIC ---
+        # This regex is designed to find the start of the actual message content
+        # It looks for a pattern like [date, time] author: or date, time - author:
+        # and captures everything after it.
+        message_pattern = re.compile(r'^\[?.*?\]?\s*.*?:\s*(.*)')
         cleaned_documents = []
         for line in lines:
+            match = message_pattern.match(line)
+            # If a match is found, the actual message is in the first group
+            if match and match.group(1):
+                cleaned_documents.append(match.group(1).strip())
         if not cleaned_documents:
+            print("ERROR: Still could not extract any valid messages. Please check the format of 'my_data.txt'.")
+            cleaned_documents = ["Error: The data file 'my_data.txt' could not be processed."]
         else:
             print(f"Successfully loaded and cleaned {len(cleaned_documents)} messages.")
     print("All documents have been successfully added to ChromaDB.")
 except ValueError:
+    collection = client.get_collection("whatsapp_chat_v2")
     print("ChromaDB collection loaded.")
+# --- 3. Define Chatbot Logic (No changes here) ---
 def chatbot_response(message, history):
     query_embedding = retriever_model.encode([message]).tolist()
     results = collection.query(
         query_embeddings=query_embedding,
+        n_results=5
     )
     retrieved_documents = results['documents'][0]
     if not retrieved_documents or "Error:" in retrieved_documents[0]:
         return "I'm sorry, I couldn't find any relevant information in the chat history. 🤔"
     context = "\n- ".join(retrieved_documents)
     prompt = f"""
     Based on the following excerpts from a WhatsApp chat, please answer the user's question.
     Answer:
     """
     generated_text = generator_pipe(prompt, max_length=100, num_beams=5, early_stopping=True)
     response = generated_text[0]['generated_text']