Spaces:
Sleeping
Sleeping
| import json | |
| from sentence_transformers import SentenceTransformer, util | |
| from groq import Groq | |
| from datetime import datetime | |
| import os | |
| import pandas as pd | |
| from datasets import load_dataset, Dataset | |
| from dotenv import load_dotenv | |
# Load environment variables (read below via os.getenv) from a .env file.
load_dotenv()

# Initialize Groq client
groq_client = Groq(api_key=os.getenv("GROQ_API_KEY"))

# Load similarity model
similarity_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Config
HF_DATASET_REPO = "midrees2806/unmatched_queries"
HF_TOKEN = os.getenv("HF_TOKEN")

# Greeting list
GREETINGS = [
    "hi", "hello", "hey", "good morning", "good afternoon", "good evening",
    "assalam o alaikum", "salam", "aoa", "hi there",
    "hey there", "greetings"
]

# Load local dataset: a JSON list of {"Question": ..., "Answer": ...} objects.
# Any failure (missing file, bad JSON, wrong structure) is reported and the
# module falls back to an empty dataset instead of failing at import time.
try:
    # JSON text is UTF-8; do not depend on the platform's locale encoding.
    with open('dataset.json', 'r', encoding='utf-8') as f:
        dataset = json.load(f)
    if not isinstance(dataset, list) or not all(
        isinstance(item, dict) and 'Question' in item and 'Answer' in item
        for item in dataset
    ):
        raise ValueError("Invalid dataset structure")
except Exception as e:
    print(f"Error loading dataset: {e}")
    dataset = []

# Precompute embeddings once so each request only has to encode the user input.
# Questions are lower-cased and stripped, matching how queries are normalized.
dataset_questions = [item.get("Question", "").lower().strip() for item in dataset]
dataset_answers = [item.get("Answer", "") for item in dataset]
dataset_embeddings = similarity_model.encode(dataset_questions, convert_to_tensor=True)
# Save unmatched queries to Hugging Face
def manage_unmatched_queries(query: str) -> None:
    """Record ``query`` in the unmatched-queries dataset on the Hugging Face Hub.

    The current "train" split of ``HF_DATASET_REPO`` is loaded, ``query`` is
    appended with a timestamp and ``Processed=False`` unless an identical
    query is already present, and the whole table is pushed back.

    This is best-effort: every failure is printed and swallowed so the caller
    is never interrupted. Nothing is returned.

    Args:
        query: The raw user query to log.
    """
    try:
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        try:
            ds = load_dataset(HF_DATASET_REPO, token=HF_TOKEN)
            df = ds["train"].to_pandas()
        except FileNotFoundError:
            # Only a missing/empty repository means "start a fresh table".
            # Any other load failure (network, auth, ...) must NOT fall through
            # to the push below, or the existing remote log would be replaced
            # by a single row; it propagates to the outer handler instead.
            # NOTE(review): relies on `datasets` raising FileNotFoundError
            # subclasses for missing/empty repos -- confirm for the pinned version.
            df = pd.DataFrame(columns=["Query", "Timestamp", "Processed"])

        if query in df["Query"].values:
            # Already logged; avoid a needless upload.
            return

        new_entry = {"Query": query, "Timestamp": timestamp, "Processed": False}
        df = pd.concat([df, pd.DataFrame([new_entry])], ignore_index=True)
        updated_ds = Dataset.from_pandas(df)
        updated_ds.push_to_hub(HF_DATASET_REPO, token=HF_TOKEN)
    except Exception as e:
        print(f"Failed to save query: {e}")
# Query Groq LLM
def query_groq_llm(prompt, model_name="llama3-70b-8192"):
    """Send ``prompt`` to Groq as a single user message and return the reply.

    Args:
        prompt: Text placed in the one "user" message of the chat request.
        model_name: Groq model identifier passed through to the API.

    Returns:
        The first choice's message content with surrounding whitespace
        removed, or an empty string if anything goes wrong (the error is
        printed, never raised).
    """
    request = {
        "messages": [{"role": "user", "content": prompt}],
        "model": model_name,
        "temperature": 0.7,
        "max_tokens": 500,
    }
    try:
        completion = groq_client.chat.completions.create(**request)
        reply = completion.choices[0].message.content
        return reply.strip()
    except Exception as e:
        print(f"Error querying Groq API: {e}")
        return ""
def _contains_greeting(text):
    """Return True when ``text`` contains a GREETINGS phrase as whole words."""
    # Turn punctuation into spaces so "hi!" and "hello," still match, then pad
    # with spaces so a phrase is only found on word boundaries. A plain
    # substring test would find "hi" inside "this" or "hey" inside "they".
    words = "".join(ch if ch.isalnum() else " " for ch in text).split()
    padded = f" {' '.join(words)} "
    return any(f" {greet} " in padded for greet in GREETINGS)


# Main logic function to be called from Gradio
def get_best_answer(user_input):
    """Produce the chatbot reply for ``user_input``.

    Steps, in order:
      1. Reject empty input, and non-greeting input shorter than three words.
      2. Greetings are answered by the LLM (with a fixed fallback).
      3. Fee-related questions get a fixed pointer to the fee page.
      4. Otherwise the input is compared with the precomputed dataset
         embeddings. A match at or above the threshold is rephrased by the
         LLM; anything else is logged via ``manage_unmatched_queries`` and
         answered by the LLM without a reference answer.
      5. If the LLM returns nothing, fall back to the raw dataset answer
         (when matched) or to static contact details.

    Args:
        user_input: The text typed by the user; ``None`` is treated as empty.

    Returns:
        The reply text to display.
    """
    match_threshold = 0.65  # minimum cosine similarity to trust a dataset answer

    if not user_input or not user_input.strip():
        return "Please enter a valid question."

    user_input_lower = user_input.lower().strip()
    is_greeting = _contains_greeting(user_input_lower)

    if len(user_input_lower.split()) < 3 and not is_greeting:
        return "Please ask your question properly with at least 3 words."

    if is_greeting:
        greeting_response = query_groq_llm(
            f"You are an official assistant for University of Education Lahore. "
            f"Respond to this greeting in a friendly and professional manner: {user_input}"
        )
        return greeting_response if greeting_response else "Hello! How can I assist you today?"

    fee_keywords = ("fee structure", "fees structure", "semester fees", "semester fee")
    if any(keyword in user_input_lower for keyword in fee_keywords):
        # NOTE(review): the leading characters of these user-facing strings
        # (here and in the contact fallback below) look like mis-decoded
        # emoji; kept as-is -- confirm the intended glyphs.
        return (
            "π° For complete and up-to-date fee details for this program, we recommend visiting the official University of Education fee structure page.\n"
            "You'll find comprehensive information regarding tuition, admission charges, and other applicable fees there.\n"
            "π https://ue.edu.pk/allfeestructure.php"
        )

    # With an empty dataset there is nothing to compare against (and argmax on
    # an empty similarity tensor would raise), so treat it as "no match".
    best_match_idx = None
    best_score = 0.0
    if dataset_answers:
        user_embedding = similarity_model.encode(user_input_lower, convert_to_tensor=True)
        similarities = util.pytorch_cos_sim(user_embedding, dataset_embeddings)[0]
        best_match_idx = similarities.argmax().item()
        best_score = similarities[best_match_idx].item()

    matched = best_match_idx is not None and best_score >= match_threshold

    if matched:
        original_answer = dataset_answers[best_match_idx]
        prompt = (
            "As an official assistant for University of Education Lahore, provide a clear response:\n"
            f"Question: {user_input}\n"
            f"Original Answer: {original_answer}\n"
            "Improved Answer:"
        )
    else:
        # Log the miss so the dataset can be extended later.
        manage_unmatched_queries(user_input)
        original_answer = None
        prompt = (
            "As an official assistant for University of Education Lahore, provide a helpful response:\n"
            "Include relevant details about university policies.\n"
            "If unsure, direct to official channels.\n"
            f"Question: {user_input}\n"
            "Official Answer:"
        )

    llm_response = query_groq_llm(prompt)
    if llm_response:
        # The model sometimes echoes the prompt's trailing label; keep only
        # the text after the first marker found, checked in this order.
        for marker in ("Improved Answer:", "Official Answer:", "Rephrased Answer:"):
            if marker in llm_response:
                return llm_response.split(marker)[-1].strip()
        return llm_response

    # LLM unavailable: fall back to the stored answer or static contact info.
    if matched:
        return original_answer
    return (
        "For official information:\n"
        "π +92-42-99262231-33\n"
        "βοΈ info@ue.edu.pk\n"
        "π https://ue.edu.pk"
    )