# UoeChatbot / rag.py
import json
import glob
import os
import random
import pandas as pd
from datetime import datetime
from dotenv import load_dotenv
# Core AI Libraries
from sentence_transformers import SentenceTransformer, util
from groq import Groq
from datasets import load_dataset, Dataset
# Load environment variables from a local .env file (provides GROQ_API_KEY, HF_TOKEN)
load_dotenv()
# Groq client used for all LLM chat completions
groq_client = Groq(api_key=os.getenv("GROQ_API_KEY"))
# Sentence-embedding model used to semantically match user queries to dataset questions
similarity_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
# Hugging Face dataset repo where unmatched queries are logged for later review
HF_DATASET_REPO = "midrees2806/unmatched_queries"
HF_TOKEN = os.getenv("HF_TOKEN")  # may be None if not configured
# Exact-match greetings that short-circuit the retrieval pipeline
GREETINGS = ["hi", "hello", "hey", "good morning", "good afternoon", "good evening", "assalam o alaikum", "salam", "aoa", "hi there", "hey there", "greetings"]
# --- Load Q/A pairs from every JSON file under datasets/ ---
# Each file is expected to hold a list of {"Question": ..., "Answer": ...} dicts.
# Errors are handled per file so one malformed file cannot abort loading the rest
# (the original single try/except stopped at the first bad file).
dataset = []
for file_path in glob.glob('datasets/*.json'):
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (OSError, json.JSONDecodeError) as e:
        print(f"Error loading datasets: {e}")
        continue
    if isinstance(data, list):
        # Keep only well-formed entries that carry both a question and an answer.
        dataset.extend(
            item for item in data
            if isinstance(item, dict) and 'Question' in item and 'Answer' in item
        )

# Precompute question embeddings once at startup so each incoming query
# needs only a single encode + cosine-similarity pass.
dataset_questions = [item.get("Question", "").lower().strip() for item in dataset]
dataset_answers = [item.get("Answer", "") for item in dataset]
dataset_embeddings = similarity_model.encode(dataset_questions, convert_to_tensor=True)
def manage_unmatched_queries(query: str) -> None:
    """Log *query* to the Hugging Face unmatched-queries dataset.

    Best-effort: all failures are printed and swallowed so that a logging
    hiccup can never break the chat flow. Duplicate queries are skipped.
    """
    try:
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        try:
            ds = load_dataset(HF_DATASET_REPO, token=HF_TOKEN)
            df = ds["train"].to_pandas()
        except Exception:
            # Repo missing/empty or network failure: start a fresh table.
            # (Was a bare `except:`, which would also swallow KeyboardInterrupt.)
            df = pd.DataFrame(columns=["Query", "Timestamp", "Processed"])
        if query not in df["Query"].values:
            new_entry = {"Query": query, "Timestamp": timestamp, "Processed": False}
            df = pd.concat([df, pd.DataFrame([new_entry])], ignore_index=True)
            Dataset.from_pandas(df).push_to_hub(HF_DATASET_REPO, token=HF_TOKEN)
    except Exception as e:
        print(f"Failed to save query: {e}")
def query_groq_llm(prompt):
    """Stream a completion for *prompt* from the Groq LLM.

    Returns the full, stripped response text, or None when the API call fails.
    """
    try:
        # llama-3.3-70b-versatile, consumed as a stream of delta chunks.
        stream = groq_client.chat.completions.create(
            model="llama-3.3-70b-versatile",
            messages=[
                {
                    "role": "system",
                    "content": "You are the official UOE AI Assistant. Your goal is to provide highly structured, professional, and easy-to-read information about the University of Education Lahore."
                },
                {
                    "role": "user",
                    "content": prompt
                }
            ],
            temperature=1,
            max_completion_tokens=1024,
            top_p=1,
            stream=True,
            stop=None
        )
        # Collect streamed fragments and join once at the end.
        pieces = [chunk.choices[0].delta.content or "" for chunk in stream]
        return "".join(pieces).strip()
    except Exception as e:
        print(f"Groq API Error: {e}")
        return None
def get_best_answer(user_input):
    """Answer a user question about University of Education Lahore.

    Pipeline: empty-input guard -> exact-greeting shortcut -> minimum-length
    check -> semantic match against the verified dataset (cosine >= 0.65).
    A dataset hit is reformatted by the LLM; a miss is logged via
    manage_unmatched_queries() and answered from general knowledge with a
    forwarding note. Returns the LLM response string (or None on API failure).
    """
    if not user_input.strip():
        return "Please enter a valid question."
    user_input_lower = user_input.lower().strip()
    # Greeting Handling (exact match only)
    if any(greet == user_input_lower for greet in GREETINGS):
        return "Hello! I am the UOE AI Assistant. How can I help you regarding admissions, fees, or programs today?"
    # Length check: very short non-greeting queries are too ambiguous to match.
    # (Queries merely containing a greeting word fall through to matching.)
    if len(user_input_lower.split()) < 3 and not any(greet in user_input_lower for greet in GREETINGS):
        return "Please provide more details. I need at least 3 words to understand your query properly."
    # Similarity Calculation.
    # Guard against an empty dataset: argmax on an empty similarity tensor
    # would raise, so treat "no data" as "no match" and use general knowledge.
    best_score = 0.0
    best_match_idx = -1
    if dataset_questions:
        user_embedding = similarity_model.encode(user_input_lower, convert_to_tensor=True)
        similarities = util.pytorch_cos_sim(user_embedding, dataset_embeddings)[0]
        best_match_idx = similarities.argmax().item()
        best_score = similarities[best_match_idx].item()
    if best_score >= 0.65:
        # --- PATH 1: DATASET MATCH (Enhanced Formatting) ---
        original_answer = dataset_answers[best_match_idx]
        prompt = f"""You are the official University of Education (UOE) Assistant.
I have found a verified answer in our database. Please rephrase it to make it extremely professional and user-friendly.
STRICT GUIDELINES:
1. Use clear **Headings**.
2. Use **Bullet Points** for lists.
3. If there is data like timing, criteria, or contact info, present it in a **Markdown Table**.
4. Bold important keywords.
5. Maintain a polite and welcoming tone.
User Question: {user_input}
Verified Data: {original_answer}
Enhanced Response:"""
    else:
        # --- PATH 2: GENERAL KNOWLEDGE (with Forwarding Note) ---
        manage_unmatched_queries(user_input)
        prompt = f"""You are the UOE AI Assistant for University of Education Lahore.
The user asked: "{user_input}". This query is not in our verified database.
Instructions:
1. Answer based on your general knowledge about UOE Lahore.
2. Format the response with headings and bold text.
3. At the end, you MUST add this exact text:
"📢 *Note: Your query has been forwarded to our support team. We are currently updating our verified database to provide a verified answer next time.*"
4. List official contacts: Website: https://ue.edu.pk | Phone: +92-42-99262231-33"""
    return query_groq_llm(prompt)