midrees2806 committed on
Commit
171c15b
·
verified ·
1 Parent(s): 5fcb2c5

Update rag.py

Browse files
Files changed (1) hide show
  1. rag.py +20 -37
rag.py CHANGED
@@ -6,6 +6,7 @@ import os
6
  import pandas as pd
7
  from datasets import load_dataset, Dataset
8
  from dotenv import load_dotenv
 
9
  import glob
10
 
11
  # Load environment variables
@@ -28,7 +29,15 @@ GREETINGS = [
28
  "hey there", "greetings"
29
  ]
30
 
31
- # Load multiple JSON datasets from the datasets folder
 
 
 
 
 
 
 
 
32
  dataset = []
33
  try:
34
  json_files = glob.glob('datasets/*.json')
@@ -49,10 +58,7 @@ except Exception as e:
49
  # Precompute embeddings
50
  dataset_questions = [item.get("Question", "").lower().strip() for item in dataset]
51
  dataset_answers = [item.get("Answer", "") for item in dataset]
52
- if dataset_questions:
53
- dataset_embeddings = similarity_model.encode(dataset_questions, convert_to_tensor=True)
54
- else:
55
- dataset_embeddings = None
56
 
57
  # Save unmatched queries to Hugging Face
58
  def manage_unmatched_queries(query: str):
@@ -63,7 +69,6 @@ def manage_unmatched_queries(query: str):
63
  df = ds["train"].to_pandas()
64
  except:
65
  df = pd.DataFrame(columns=["Query", "Timestamp", "Processed"])
66
-
67
  if query not in df["Query"].values:
68
  new_entry = {"Query": query, "Timestamp": timestamp, "Processed": False}
69
  df = pd.concat([df, pd.DataFrame([new_entry])], ignore_index=True)
@@ -89,18 +94,16 @@ def query_groq_llm(prompt, model_name="llama3-70b-8192"):
89
  print(f"Error querying Groq API: {e}")
90
  return ""
91
 
92
- # Main logic function
93
  def get_best_answer(user_input):
94
  if not user_input.strip():
95
  return "Please enter a valid question."
96
 
97
  user_input_lower = user_input.lower().strip()
98
 
99
- # Basic length check unless it's a greeting
100
  if len(user_input_lower.split()) < 3 and not any(greet in user_input_lower for greet in GREETINGS):
101
  return "Please ask your question properly with at least 3 words."
102
 
103
- # Specific Keyword Check for Fees
104
  if any(keyword in user_input_lower for keyword in ["fee structure", "fees structure", "semester fees", "semester fee"]):
105
  return (
106
  "💰 For complete and up-to-date fee details for this program, we recommend visiting the official University of Education fee structure page.\n"
@@ -108,53 +111,33 @@ def get_best_answer(user_input):
108
  "🔗 https://ue.edu.pk/allfeestructure.php"
109
  )
110
 
111
- # Calculate Similarity
112
  user_embedding = similarity_model.encode(user_input_lower, convert_to_tensor=True)
113
  similarities = util.pytorch_cos_sim(user_embedding, dataset_embeddings)[0]
114
  best_match_idx = similarities.argmax().item()
115
  best_score = similarities[best_match_idx].item()
116
 
117
- # LOGIC BRANCHING
118
- if best_score >= 0.65:
119
- # PATH 1: Dataset Match - Rephrase the Answer
120
- original_answer = dataset_answers[best_match_idx]
121
- prompt = f"""Name is UOE AI Assistant! You are an official assistant for the University of Education Lahore.
 
122
  Rephrase the following official answer clearly and professionally.
123
- Use structured formatting (like headings, bullet points, or numbered lists) where appropriate to make it attractive.
124
  DO NOT add any new or extra information. ONLY rephrase and improve the clarity and formatting of the original answer.
125
  ### Question:
126
  {user_input}
127
  ### Original Answer:
128
  {original_answer}
129
  ### Rephrased Answer:
130
- """
131
- else:
132
- # PATH 2: No Match - Answer from LLM Knowledge and Log
133
- manage_unmatched_queries(user_input)
134
- prompt = f"""Name is UOE AI Assistant! As an official assistant for University of Education Lahore, provide a helpful and professional response based on university standards.
135
- Include relevant details about university policies if known.
136
- If unsure about specific dates or numbers, direct the user to official channels.
137
- ### Question:
138
- {user_input}
139
- ### Official Answer:
140
  """
141
 
142
  llm_response = query_groq_llm(prompt)
143
 
144
  if llm_response:
145
- # Clean up markers if the LLM includes them in the output
146
  for marker in ["Improved Answer:", "Official Answer:", "Rephrased Answer:"]:
147
  if marker in llm_response:
148
  return llm_response.split(marker)[-1].strip()
149
  return llm_response
150
  else:
151
- # Fallback if Groq fails
152
- if best_score >= 0.65:
153
- return dataset_answers[best_match_idx]
154
- else:
155
- return (
156
- "For official information:\n"
157
- "📞 +92-42-99262231-33\n"
158
- "✉️ info@ue.edu.pk\n"
159
- "🌐 https://ue.edu.pk"
160
- )
 
6
  import pandas as pd
7
  from datasets import load_dataset, Dataset
8
  from dotenv import load_dotenv
9
+ import random
10
  import glob
11
 
12
  # Load environment variables
 
29
  "hey there", "greetings"
30
  ]
31
 
32
+ # Fixed rephrased unmatched query responses
33
+ UNMATCHED_RESPONSES = [
34
+ "Thank you for your query. We’ve forwarded it to our support team and it will be added soon. In the meantime, you can visit the University of Education official website or reach out via the contact details below.\n\n📞 +92-42-99262231-33\n✉️ info@ue.edu.pk\n🌐 https://ue.edu.pk",
35
+ "We’ve noted your question and it’s in queue for inclusion. For now, please check the University of Education website or contact the administration directly.\n\n📞 +92-42-99262231-33\n✉️ info@ue.edu.pk\n🌐 https://ue.edu.pk",
36
+ "Your query has been recorded. We’ll update the system with relevant information shortly. Meanwhile, you can visit UE's official site or reach out using the details below:\n\n📞 +92-42-99262231-33\n✉️ info@ue.edu.pk\n🌐 https://ue.edu.pk",
37
+ "We appreciate your question. It has been forwarded for further processing. Until it’s available here, feel free to visit the official UE website or use the contact options:\n\n📞 +92-42-99262231-33\n✉️ info@ue.edu.pk\n🌐 https://ue.edu.pk"
38
+ ]
39
+
40
+ # Load multiple JSON datasets
41
  dataset = []
42
  try:
43
  json_files = glob.glob('datasets/*.json')
 
58
  # Precompute embeddings
59
  dataset_questions = [item.get("Question", "").lower().strip() for item in dataset]
60
  dataset_answers = [item.get("Answer", "") for item in dataset]
61
+ dataset_embeddings = similarity_model.encode(dataset_questions, convert_to_tensor=True)
 
 
 
62
 
63
  # Save unmatched queries to Hugging Face
64
  def manage_unmatched_queries(query: str):
 
69
  df = ds["train"].to_pandas()
70
  except:
71
  df = pd.DataFrame(columns=["Query", "Timestamp", "Processed"])
 
72
  if query not in df["Query"].values:
73
  new_entry = {"Query": query, "Timestamp": timestamp, "Processed": False}
74
  df = pd.concat([df, pd.DataFrame([new_entry])], ignore_index=True)
 
94
  print(f"Error querying Groq API: {e}")
95
  return ""
96
 
97
+ # Main logic function to be called from Gradio
98
  def get_best_answer(user_input):
99
  if not user_input.strip():
100
  return "Please enter a valid question."
101
 
102
  user_input_lower = user_input.lower().strip()
103
 
 
104
  if len(user_input_lower.split()) < 3 and not any(greet in user_input_lower for greet in GREETINGS):
105
  return "Please ask your question properly with at least 3 words."
106
 
 
107
  if any(keyword in user_input_lower for keyword in ["fee structure", "fees structure", "semester fees", "semester fee"]):
108
  return (
109
  "💰 For complete and up-to-date fee details for this program, we recommend visiting the official University of Education fee structure page.\n"
 
111
  "🔗 https://ue.edu.pk/allfeestructure.php"
112
  )
113
 
 
114
  user_embedding = similarity_model.encode(user_input_lower, convert_to_tensor=True)
115
  similarities = util.pytorch_cos_sim(user_embedding, dataset_embeddings)[0]
116
  best_match_idx = similarities.argmax().item()
117
  best_score = similarities[best_match_idx].item()
118
 
119
+ if best_score < 0.65:
120
+ manage_unmatched_queries(user_input)
121
+ return random.choice(UNMATCHED_RESPONSES)
122
+
123
+ original_answer = dataset_answers[best_match_idx]
124
+ prompt = f"""Name is UOE AI Assistant! You are an official assistant for the University of Education Lahore.
125
  Rephrase the following official answer clearly and professionally.
126
+ Use structured formatting (like headings, bullet points, or numbered lists) where appropriate.
127
  DO NOT add any new or extra information. ONLY rephrase and improve the clarity and formatting of the original answer.
128
  ### Question:
129
  {user_input}
130
  ### Original Answer:
131
  {original_answer}
132
  ### Rephrased Answer:
 
 
 
 
 
 
 
 
 
 
133
  """
134
 
135
  llm_response = query_groq_llm(prompt)
136
 
137
  if llm_response:
 
138
  for marker in ["Improved Answer:", "Official Answer:", "Rephrased Answer:"]:
139
  if marker in llm_response:
140
  return llm_response.split(marker)[-1].strip()
141
  return llm_response
142
  else:
143
+ return dataset_answers[best_match_idx]