Spaces:

midrees2806
/

UoeChatbot

Sleeping

App Files Files Community

midrees2806 committed on Jan 8

Commit

63eae27

verified ·

1 Parent(s): 238cf2b

Delete rag.py

Browse files

Files changed (1) hide show

rag.py +0 -144

rag.py DELETED Viewed

@@ -1,144 +0,0 @@
-import json
-import glob
-import os
-import pandas as pd
-import numpy as np
-from datetime import datetime
-from dotenv import load_dotenv
-# Core AI Libraries
-from sentence_transformers import SentenceTransformer, util
-from groq import Groq
-from datasets import load_dataset, Dataset
-# Image/UI Utils (Optional based on your imports)
-from PIL import Image, ImageDraw, ImageFont
-from io import BytesIO
-# 1. INITIALIZATION & CONFIG
-# Read variables from a .env file (if present) before the API keys are looked up.
-load_dotenv()
-groq_client = Groq(api_key=os.getenv("GROQ_API_KEY"))
-similarity_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
-HF_DATASET_REPO = "midrees2806/unmatched_queries"
-HF_TOKEN = os.getenv("HF_TOKEN")
-# 2. DATA LOADING
-# Collect every dict that has both a 'Question' and an 'Answer' key from the
-# JSON files in the datasets folder.
-dataset = []
-json_files = glob.glob('datasets/*.json')
-for file_path in json_files:
-    # Errors are handled per file so that one unreadable or malformed file
-    # does not prevent the remaining files from being loaded.
-    try:
-        with open(file_path, 'r', encoding='utf-8') as f:
-            data = json.load(f)
-    except (OSError, ValueError) as e:
-        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
-        print(f"Error loading {file_path}: {e}")
-        continue
-    if not isinstance(data, list):
-        print(f"Skipping {file_path}: Expected a list of dictionaries.")
-        continue
-    dataset.extend(
-        item for item in data
-        if isinstance(item, dict) and 'Question' in item and 'Answer' in item
-    )
-# Precompute embeddings for faster search
-# Questions are lower-cased and stripped; answers are kept as stored.
-dataset_questions = [item.get("Question", "").lower().strip() for item in dataset]
-dataset_answers = [item.get("Answer", "") for item in dataset]
-# Encode all questions once at startup; None marks "no data available".
-if dataset_questions:
-    dataset_embeddings = similarity_model.encode(dataset_questions, convert_to_tensor=True)
-else:
-    dataset_embeddings = None
-    print("Warning: Dataset is empty!")
-# 3. UTILITY FUNCTIONS
-def manage_unmatched_queries(query: str):
-    """Record a query that found no close dataset match on the Hugging Face Hub.
-
-    The "train" split of HF_DATASET_REPO is loaded into a DataFrame; if that
-    load raises for any reason, an empty frame with the columns Query,
-    Timestamp and Processed is used instead. The query is appended (with the
-    current local time and Processed=False) only when it is not already in
-    the "Query" column, and the resulting table is pushed back to the same
-    repo. Any failure is printed instead of being raised.
-    """
-    # NOTE(review): every load error yields an empty frame, so a failed load
-    # followed by a successful push may replace the stored rows with just
-    # this entry -- confirm whether that is acceptable.
-    try:
-        logged_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-        try:
-            records = load_dataset(HF_DATASET_REPO, token=HF_TOKEN)["train"].to_pandas()
-        except Exception:
-            records = pd.DataFrame(columns=["Query", "Timestamp", "Processed"])
-        # Already logged: nothing to add or push.
-        if query in records["Query"].values:
-            return
-        row = pd.DataFrame([{"Query": query, "Timestamp": logged_at, "Processed": False}])
-        records = pd.concat([records, row], ignore_index=True)
-        Dataset.from_pandas(records).push_to_hub(HF_DATASET_REPO, token=HF_TOKEN)
-    except Exception as e:
-        print(f"Failed to save unmatched query: {e}")
-def query_groq_llm(prompt, system_message="You are an official assistant for the University of Education Lahore."):
-    """Request a chat completion from Groq and return the reply text.
-
-    Args:
-        prompt: Text sent as the user message.
-        system_message: Text sent as the system message.
-
-    Returns:
-        The stripped content of the first choice, or None if the request or
-        the response handling raised (the error is printed).
-    """
-    conversation = [
-        {"role": "system", "content": system_message},
-        {"role": "user", "content": prompt},
-    ]
-    try:
-        completion = groq_client.chat.completions.create(
-            messages=conversation,
-            model="llama3-70b-8192",
-            temperature=0.6,
-            max_tokens=800,
-        )
-        reply = completion.choices[0].message.content
-        return reply.strip()
-    except Exception as e:
-        print(f"Error querying Groq: {e}")
-        return None
-# 4. MAIN RAG LOGIC
-def get_best_answer(user_input):
-    """Answer a question from the loaded Q&A dataset, with an LLM fallback.
-
-    Steps:
-      1. Reject empty, non-string, or shorter-than-three-word input.
-      2. Return a fixed link for fee-structure questions.
-      3. Find the most similar dataset question by cosine similarity.
-      4. If the best score reaches the threshold, ask the LLM to rephrase the
-         stored answer; otherwise log the query via manage_unmatched_queries
-         and ask the LLM for a general reply.
-      5. If the LLM returns nothing, fall back to the stored answer (match)
-         or to a contact message (no match).
-
-    Args:
-        user_input: The question text entered by the user.
-
-    Returns:
-        The response text to show to the user.
-    """
-    # Basic Validation
-    # None or other non-string input is treated like an empty question
-    # instead of raising AttributeError on .strip().
-    if not isinstance(user_input, str) or not user_input.strip():
-        return "Please enter a valid question."
-    user_input_lower = user_input.lower().strip()
-    if len(user_input_lower.split()) < 3:
-        return "Please ask your question with more detail (at least 3 words)."
-    # Special Case: Fee Structure (Direct Link)
-    if any(kw in user_input_lower for kw in ["fee structure", "fees structure", "fee list"]):
-        return (
-            "💰 For complete and up-to-date fee details, please visit the official page:\n"
-            "🔗 https://ue.edu.pk/allfeestructure.php"
-        )
-    # Calculate Similarity
-    if dataset_embeddings is None:
-        return "System is currently updating. Please try again in a moment."
-    user_embedding = similarity_model.encode(user_input_lower, convert_to_tensor=True)
-    similarities = util.pytorch_cos_sim(user_embedding, dataset_embeddings)[0]
-    best_match_idx = similarities.argmax().item()
-    best_score = similarities[best_match_idx].item()
-    # DECISION BRIDGE
-    # The threshold is named once, and each branch sets its own fallback, so
-    # the final fallback can never disagree with the branch that was taken.
-    similarity_threshold = 0.65
-    if best_score >= similarity_threshold:
-        # PATH 1: Verified Data Found
-        original_answer = dataset_answers[best_match_idx]
-        fallback_answer = original_answer
-        prompt = (
-            f"A student asked: '{user_input}'.\n"
-            f"Based on our official records: '{original_answer}'.\n"
-            "Please rewrite this into a friendly, professional response."
-        )
-        system_role = "You are an official University assistant. Use ONLY the provided records."
-    else:
-        # PATH 2: No Match - Fallback to General Knowledge & Log
-        manage_unmatched_queries(user_input)
-        fallback_answer = "Please contact the university at info@ue.edu.pk for assistance."
-        prompt = (
-            f"A student asked: '{user_input}'.\n"
-            "I don't have a specific record for this. Provide a general helpful response "
-            "based on university standards. If you are unsure about specific dates or costs, "
-            "direct them to contact info@ue.edu.pk."
-        )
-        system_role = "You are a helpful University assistant. Direct unknown specific queries to official channels."
-    # Final Generation
-    llm_response = query_groq_llm(prompt, system_message=system_role)
-    # Final Fallback in case API fails (query_groq_llm returns None on error)
-    if not llm_response:
-        return fallback_answer
-    return llm_response