Rohitface committed on
Commit
0a92d7a
·
verified ·
1 Parent(s): 24ee0b7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -27
app.py CHANGED
@@ -4,15 +4,14 @@ from sentence_transformers import SentenceTransformer
4
  from transformers import pipeline
5
  import re # Import the regular expressions library
6
 
7
- # --- 1. Load Models ---
8
  print("Loading sentence-transformer model for retrieval...")
9
- # This model is for finding relevant chat lines
10
  retriever_model = SentenceTransformer('all-MiniLM-L6-v2')
11
  print("Retriever model loaded.")
12
 
13
  print("Loading generative model for answering...")
14
- # This model will generate the actual answers
15
- generator_pipe = pipeline("text2text-generation", model="google/flan-t5-small")
16
  print("Generative model loaded.")
17
 
18
 
@@ -20,34 +19,32 @@ print("Generative model loaded.")
20
  client = chromadb.Client()
21
 
22
  try:
23
- collection = client.create_collection("whatsapp_chat")
 
24
  print("ChromaDB collection created.")
25
 
26
- # --- Data Loading and CLEANING ---
27
  try:
28
  print("Loading data from my_data.txt...")
29
  with open('my_data.txt', 'r', encoding='utf-8') as f:
30
  lines = [line.strip() for line in f if line.strip()]
31
 
32
- # --- NEW: Clean the chat data ---
33
- # This pattern removes the date, time, and author (e.g., "M/D/YY, HH:MM - Author:")
34
- # It keeps only the actual message content.
 
 
 
35
  cleaned_documents = []
36
  for line in lines:
37
- # Find the position of the first ':'
38
- first_colon_pos = line.find(':')
39
- if first_colon_pos != -1:
40
- # Find the position of ' - ' before the colon
41
- separator_pos = line.rfind(' - ', 0, first_colon_pos)
42
- if separator_pos != -1:
43
- # Extract the message part
44
- message = line[first_colon_pos + 1:].strip()
45
- if message: # Ensure the message is not empty
46
- cleaned_documents.append(message)
47
 
48
  if not cleaned_documents:
49
- print("Warning: Could not extract any valid messages from my_data.txt.")
50
- cleaned_documents = ["Error: The data file 'my_data.txt' appears to have no valid messages."]
51
  else:
52
  print(f"Successfully loaded and cleaned {len(cleaned_documents)} messages.")
53
 
@@ -74,24 +71,22 @@ try:
74
  print("All documents have been successfully added to ChromaDB.")
75
 
76
  except ValueError:
77
- collection = client.get_collection("whatsapp_chat")
78
  print("ChromaDB collection loaded.")
79
 
80
 
81
- # --- 3. Define the NEW Chatbot Logic ---
82
  def chatbot_response(message, history):
83
- # 1. Retrieve relevant documents from ChromaDB
84
  query_embedding = retriever_model.encode([message]).tolist()
85
  results = collection.query(
86
  query_embeddings=query_embedding,
87
- n_results=5 # Retrieve more context, e.g., 5 lines
88
  )
89
  retrieved_documents = results['documents'][0]
90
 
91
  if not retrieved_documents or "Error:" in retrieved_documents[0]:
92
  return "I'm sorry, I couldn't find any relevant information in the chat history. 🤔"
93
 
94
- # 2. Augment the prompt for the generative model
95
  context = "\n- ".join(retrieved_documents)
96
  prompt = f"""
97
  Based on the following excerpts from a WhatsApp chat, please answer the user's question.
@@ -106,7 +101,6 @@ def chatbot_response(message, history):
106
  Answer:
107
  """
108
 
109
- # 3. Generate the final response
110
  generated_text = generator_pipe(prompt, max_length=100, num_beams=5, early_stopping=True)
111
  response = generated_text[0]['generated_text']
112
 
 
4
  from transformers import pipeline
5
  import re # Import the regular expressions library
6
 
7
+ # --- 1. Load Models (No changes here) ---
8
  print("Loading sentence-transformer model for retrieval...")
 
9
  retriever_model = SentenceTransformer('all-MiniLM-L6-v2')
10
  print("Retriever model loaded.")
11
 
12
  print("Loading generative model for answering...")
13
+ # Set device to -1 to force CPU, which is more stable on Hugging Face Spaces free tier
14
+ generator_pipe = pipeline("text2text-generation", model="google/flan-t5-small", device=-1)
15
  print("Generative model loaded.")
16
 
17
 
 
19
  client = chromadb.Client()
20
 
21
  try:
22
+ # Using a new collection name to ensure a fresh start
23
+ collection = client.create_collection("whatsapp_chat_v2")
24
  print("ChromaDB collection created.")
25
 
26
+ # --- Data Loading and NEW, MORE ROBUST CLEANING ---
27
  try:
28
  print("Loading data from my_data.txt...")
29
  with open('my_data.txt', 'r', encoding='utf-8') as f:
30
  lines = [line.strip() for line in f if line.strip()]
31
 
32
+ # --- NEW & IMPROVED CLEANING LOGIC ---
33
+ # This regex is designed to find the start of the actual message content
34
+ # It looks for a pattern like [date, time] author: or date, time - author:
35
+ # and captures everything after it.
36
+ message_pattern = re.compile(r'^\[?.*?\]?\s*.*?:\s*(.*)')
37
+
38
  cleaned_documents = []
39
  for line in lines:
40
+ match = message_pattern.match(line)
41
+ # If a match is found, the actual message is in the first group
42
+ if match and match.group(1):
43
+ cleaned_documents.append(match.group(1).strip())
 
 
 
 
 
 
44
 
45
  if not cleaned_documents:
46
+ print("ERROR: Still could not extract any valid messages. Please check the format of 'my_data.txt'.")
47
+ cleaned_documents = ["Error: The data file 'my_data.txt' could not be processed."]
48
  else:
49
  print(f"Successfully loaded and cleaned {len(cleaned_documents)} messages.")
50
 
 
71
  print("All documents have been successfully added to ChromaDB.")
72
 
73
  except ValueError:
74
+ collection = client.get_collection("whatsapp_chat_v2")
75
  print("ChromaDB collection loaded.")
76
 
77
 
78
+ # --- 3. Define Chatbot Logic (No changes here) ---
79
  def chatbot_response(message, history):
 
80
  query_embedding = retriever_model.encode([message]).tolist()
81
  results = collection.query(
82
  query_embeddings=query_embedding,
83
+ n_results=5
84
  )
85
  retrieved_documents = results['documents'][0]
86
 
87
  if not retrieved_documents or "Error:" in retrieved_documents[0]:
88
  return "I'm sorry, I couldn't find any relevant information in the chat history. 🤔"
89
 
 
90
  context = "\n- ".join(retrieved_documents)
91
  prompt = f"""
92
  Based on the following excerpts from a WhatsApp chat, please answer the user's question.
 
101
  Answer:
102
  """
103
 
 
104
  generated_text = generator_pipe(prompt, max_length=100, num_beams=5, early_stopping=True)
105
  response = generated_text[0]['generated_text']
106