# Source: rag_server/prepare_knowledge_base.py
# (Hugging Face file-page header retained as a comment so the file parses:
#  uploaded by atkiya110, "Upload 9 files", commit 077b8ab verified, 23.1 kB)
import json
import os
import requests
from vector_store import VectorStore
from typing import List, Dict
# GitHub repository configuration: every knowledge-base JSON file is fetched
# as a raw file from the main branch of this repository.
BASE_URL: str = "https://raw.githubusercontent.com/Atkiya/RasaChatbot/main/"
# COMPLETE Data source files from your GitHub repository.
# Maps a logical source name (used by main() to pick a processor and to label
# documents) to the JSON filename appended to BASE_URL.
# NOTE(review): "dynamic_tution_fees.json" and "dynamic_facilites.json" look
# misspelled, but they must match the actual filenames in the remote repo —
# verify against the repository before "correcting" them.
GITHUB_DATA_SOURCES: Dict[str, str] = {
    # ========================================
    # DYNAMIC FILES (Updated frequently)
    # ========================================
    "admission_calendar": "dynamic_admission_calendar.json",
    "admission_process": "dynamic_admission_process.json",
    "admission_requirements": "dynamic_admission_requirements.json",
    "tuition_fees": "dynamic_tution_fees.json",
    "events_workshops": "dynamic_events_workshops.json",
    "faculty": "dynamic_faculty.json",
    "grading": "dynamic_grading.json",
    "facilities": "dynamic_facilites.json",
    # ========================================
    # STATIC FILES (General info)
    # ========================================
    "about_ewu": "static_aboutEWU.json",
    "admin": "static_Admin.json",
    "all_programs": "static_AllAvailablePrograms.json",
    "campus_life": "static_campus_life.json",
    "career_counseling": "static_Career_Counseling_Center.json",
    "clubs": "static_clubs.json",
    "departments": "static_depts.json",
    "facilities_static": "static_facilities.json",
    "facilities17": "static_facilities17.json",
    "helpdesk": "static_helpdesk.json",
    "payment_procedure": "static_payment_procedure.json",
    "policy": "static_Policy.json",
    "programs": "static_Programs.json",
    "rules": "static_Rules.json",
    "scholarships": "static_scholarship_and_financial.json",
    "sexual_harassment": "static_Sexual_harassment.json",
    "tuition_fees_static": "static_Tuition_fees.json",
    # ========================================
    # GRADUATE PROGRAMS (Master's, PhD)
    # ========================================
    "ma_english": "ma_english.json",
    "mba_emba": "mba_emba.json",
    "mds": "mds.json",
    "mphil_pharmacy": "mphil_pharmacy.json",
    "mss_economics": "mss_eco.json",
    "ms_cse": "ms_cse.json",
    "ms_dsa": "ms_dsa.json",
    "tesol": "tesol.json",
    # ========================================
    # UNDERGRADUATE PROGRAMS (Bachelor's)
    # ========================================
    "st_ba": "st_ba.json",
    "st_ce": "st_ce.json",
    "st_cse": "st_cse.json",
    "st_ece": "st_ece.json",
    "st_economics": "st_economics.json",
    "st_eee": "st_eee.json",
    "st_english": "st_english.json",
    "st_geb": "st_geb.json",
    "st_information_studies": "st_information_studies.json",
    "st_law": "st_law.json",
    "st_math": "st_math.json",
    "st_pharmacy": "st_pharmacy.json",
    "st_social_relations": "st_social_relations.json",
    "st_sociology": "st_sociology.json",
}
# ============================================================================
# UTILITY FUNCTIONS
# ============================================================================
def load_from_github(filename: str) -> dict:
    """Fetch and parse one JSON file from the GitHub repository.

    Args:
        filename: File name relative to BASE_URL (raw GitHub URL).

    Returns:
        The parsed JSON object on success, or ``None`` when the request
        fails, times out, or returns a non-200 status (callers treat a
        falsy result as "skip this source").
    """
    try:
        url = BASE_URL + filename
        # BUGFIX: the progress message previously printed a literal
        # "(unknown)" — the {filename} placeholder had been lost.
        print(f" πŸ“₯ Fetching: {filename}...", end=" ")
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            print("βœ…")
            return response.json()
        else:
            print(f"❌ (Status: {response.status_code})")
            return None
    except Exception as e:
        # Truncate long network/parse errors to keep the progress log tidy.
        print(f"❌ ({str(e)[:50]})")
        return None
def flatten_dict(d: dict, parent_key: str = '', sep: str = ' > ') -> str:
    """Render a nested dictionary as human-readable text.

    Nested dicts become upper-cased section headers followed by their
    flattened contents; list elements are numbered when they are dicts
    and shown as bullet lines otherwise. Scalar values are rendered as
    "path: value" lines joined with newlines.
    """
    out = []
    for key, value in d.items():
        path = f"{parent_key}{sep}{key}" if parent_key else key
        if isinstance(value, dict):
            out += [f"\n{path.upper()}:", flatten_dict(value, path, sep)]
        elif isinstance(value, list):
            out.append(f"\n{path.upper()}:")
            for idx, element in enumerate(value, 1):
                if isinstance(element, dict):
                    # Dict elements restart path-building from the root.
                    out += [f"\n [{idx}]", flatten_dict(element, '', sep)]
                else:
                    out.append(f" - {element}")
        else:
            out.append(f"{path}: {value}")
    return "\n".join(out)
# ============================================================================
# SPECIFIC PROCESSORS
# ============================================================================
def process_tuition_fees(data: dict) -> List[Dict]:
    """Turn the tuition-fees JSON into retrievable document chunks.

    Emits, in order: one chunk per undergraduate per-credit fee, one per
    undergraduate detailed fee structure, one per graduate detailed fee
    structure, and one per miscellaneous fee category. Returns [] for
    empty/missing data.
    """
    if not data:
        return []

    documents = []
    applicable_from = data.get('page_info', {}).get('applicable_from', 'N/A')

    def _detailed_doc(program, level_label, level_key):
        # Shared builder for the UG/grad "detailed_fee_structure" chunks.
        return {
            "content": (
                f"Program: {program['program']}\n"
                f"Total Tuition Fee: {program['tuition_fees']} BDT\n"
                f"Total Credits: {program['credits']}\n"
                f"Grand Total Program Cost: {program['grand_total']} BDT\n"
                f"Program Level: {level_label}"
            ),
            "source": "tuition_fees.json",
            "metadata": {
                "type": "tuition_detailed",
                "program": program['program'],
                "level": level_key,
            },
        }

    undergrad = data.get("undergraduate_programs", {})

    # Per-credit fees (undergraduate only).
    for program in undergrad.get("tuition_fees_per_credit", []):
        documents.append({
            "content": (
                f"Program: {program['program']}\n"
                f"Fee Per Credit: {program['fee_per_credit']} BDT/credit\n"
                "Program Type: Undergraduate\n"
                f"Applicable From: {applicable_from}"
            ),
            "source": "tuition_fees.json",
            "metadata": {
                "type": "tuition_per_credit",
                "program": program['program'],
                "level": "undergraduate",
            },
        })

    # Detailed fee structures, undergraduate then graduate.
    for program in undergrad.get("detailed_fee_structure", []):
        documents.append(_detailed_doc(program, "Undergraduate", "undergraduate"))
    for program in data.get("graduate_programs", {}).get("detailed_fee_structure", []):
        documents.append(_detailed_doc(program, "Graduate", "graduate"))

    # Flat map of named one-off fees (admission fee, lab fee, ...).
    for fee_name, fee_value in data.get("fee_categories", {}).items():
        documents.append({
            "content": (
                f"{fee_name.replace('_', ' ').title()}: {fee_value}\n"
                f"Fee Type: {fee_name}"
            ),
            "source": "tuition_fees.json",
            "metadata": {"type": "fee_category", "fee_name": fee_name},
        })

    return documents
def process_admission_calendar(data: dict) -> List[Dict]:
    """Build one document per admission deadline entry (UG first, then grad)."""
    if not data:
        return []

    documents = []
    semester = data.get('page_info', {}).get('semester', 'N/A')

    # Both sections share the same shape; only the key and level label differ.
    sections = (
        ("undergraduate_admission", "Undergraduate", "undergraduate"),
        ("graduate_admission", "Graduate", "graduate"),
    )
    for section_key, level_label, level_key in sections:
        for program in data.get(section_key, []):
            documents.append({
                "content": (
                    f"Program: {program['program']}\n"
                    f"Application Deadline: {program['application_deadline']}\n"
                    f"Admission Test Date: {program['admission_test']}\n"
                    f"Semester: {semester}\n"
                    f"Program Level: {level_label}"
                ),
                "source": "admission_calendar.json",
                "metadata": {
                    "type": "admission_deadline",
                    "program": program['program'],
                    "level": level_key,
                },
            })
    return documents
def process_admission_process(data: dict) -> List[Dict]:
    """Flatten the whole admission-process JSON into a single document."""
    if not data:
        return []
    return [{
        "content": "Admission Process Information:\n" + flatten_dict(data),
        "source": "admission_process.json",
        "metadata": {"type": "admission_process"},
    }]
def process_admission_requirements(data: dict) -> List[Dict]:
    """Build document chunks for admission requirements.

    Emits up to three documents: general undergraduate requirements,
    B.Pharm-specific requirements, and MBA/EMBA graduate requirements.
    Returns [] when data is missing or lacks "admission_requirements".
    """
    documents = []
    if not data or "admission_requirements" not in data:
        return documents
    reqs = data["admission_requirements"]
    # Undergraduate requirements (general programs, excluding B.Pharm).
    # Nested .get() chains default every missing field to 'N/A' so a
    # partially-filled JSON file still yields a usable document.
    if "undergraduate" in reqs and "general_programs_except_bpharm" in reqs["undergraduate"]:
        ug = reqs["undergraduate"]["general_programs_except_bpharm"]
        doc = {
            "content": f"""Undergraduate Admission Requirements (General Programs except B.Pharm):
Academic Requirements:
- SSC/HSC: {ug.get('ssc_hsc', 'N/A')}
- Diploma: {ug.get('diploma', 'N/A')}
- O/A Levels: {ug.get('o_a_levels_requirement', 'N/A')}
Admission Test Weightage:
- Admission Test: {ug.get('admission_test_weightage', {}).get('admission_test', 'N/A')}
- SSC/O Level: {ug.get('admission_test_weightage', {}).get('ssc_o_level', 'N/A')}
- HSC/A Level: {ug.get('admission_test_weightage', {}).get('hsc_a_level', 'N/A')}
Subject Requirements:
- CSE: {ug.get('subject_requirements', {}).get('cse', 'N/A')}""",
            "source": "admission_requirements.json",
            "metadata": {
                "type": "admission_requirements",
                "level": "undergraduate",
                "program_type": "general"
            }
        }
        documents.append(doc)
    # B.Pharm specific requirements (has its own GPA/subject rules).
    if "undergraduate" in reqs and "b_pharm" in reqs["undergraduate"]:
        pharm = reqs["undergraduate"]["b_pharm"]
        doc = {
            "content": f"""B.Pharm (Bachelor of Pharmacy) Admission Requirements:
- Citizenship: {pharm.get('citizenship', 'N/A')}
- SSC+HSC Aggregate: {pharm.get('ssc_hsc_aggregate', 'N/A')}
- SSC+HSC Minimum Each: {pharm.get('ssc_hsc_minimum_each', 'N/A')}
- Year of Passing: {pharm.get('year_of_pass', 'N/A')}
Subject GPA Requirements:
- Chemistry: {pharm.get('subject_gpa', {}).get('chemistry', 'N/A')}
- Biology: {pharm.get('subject_gpa', {}).get('biology', 'N/A')}
- Physics: {pharm.get('subject_gpa', {}).get('physics', 'N/A')}
- Mathematics: {pharm.get('subject_gpa', {}).get('mathematics', 'N/A')}
Special Note: {pharm.get('special_note', '')}""",
            "source": "admission_requirements.json",
            "metadata": {
                "type": "admission_requirements",
                "level": "undergraduate",
                "program": "B.Pharm"
            }
        }
        documents.append(doc)
    # Graduate requirements (only MBA/EMBA has a dedicated chunk here).
    if "graduate" in reqs and "mba_emba" in reqs["graduate"]:
        mba = reqs["graduate"]["mba_emba"]
        doc = {
            "content": f"""MBA/EMBA Admission Requirements:
- Degree: {mba.get('degree', 'N/A')}
- SSC+HSC+Graduate GPA: {mba.get('ssc_hsc_graduate_gpa', 'N/A')}
Work Experience:
- MBA: {mba.get('mba_work_experience', 'N/A')}
- EMBA: {mba.get('emba_work_experience', 'N/A')}
Test Exemptions:
- EWU Graduates: {mba.get('test_exemptions', {}).get('ewu_graduates', 'N/A')}
- Other Universities: {mba.get('test_exemptions', {}).get('other_universities', 'N/A')}""",
            "source": "admission_requirements.json",
            "metadata": {
                "type": "admission_requirements",
                "level": "graduate",
                "program": "MBA/EMBA"
            }
        }
        documents.append(doc)
    return documents
def process_facilities(data: dict) -> List[Dict]:
    """Build documents for campus-life facilities and engineering labs."""
    if not data or "facilities" not in data:
        return []

    facilities = data["facilities"]
    documents = []

    # One document per campus-life facility.
    for facility in facilities.get("campus_life", {}).get("available", []):
        documents.append({
            "content": (
                f"Facility: {facility['name']}\n"
                f"Description: {facility['description']}\n"
                "Category: Campus Life"
            ),
            "source": "facilities.json",
            "metadata": {"type": "facility", "facility_name": facility['name']},
        })

    # A single combined document for the engineering labs section.
    if "engineering_labs" in facilities:
        labs_info = facilities["engineering_labs"]
        header = (
            "Engineering Laboratories at EWU\n"
            f"Departments: {', '.join(labs_info.get('departments', []))}\n"
            "Available Labs:\n"
        )
        lab_lines = "".join(f"- {lab['name']}\n" for lab in labs_info.get('labs', []))
        documents.append({
            "content": header + lab_lines,
            "source": "facilities.json",
            "metadata": {"type": "facility", "category": "engineering_labs"},
        })

    return documents
def process_faculty(data: dict) -> List[Dict]:
    """Build one document per faculty member, grouped under departments."""
    if not data or "faculty" not in data:
        return []

    documents = []
    for dept_key, dept in data["faculty"].items():
        # Skip anything that isn't a department dict with a members list.
        if not (isinstance(dept, dict) and "members" in dept):
            continue
        dept_name = dept.get("department_name", dept_key)
        for member in dept["members"]:
            documents.append({
                "content": (
                    f"Faculty Member: {member.get('name', 'N/A')}\n"
                    f"Department: {dept_name}\n"
                    f"Designation: {member.get('designation', 'N/A')}\n"
                    f"Specialization: {member.get('specialization', 'N/A')}\n"
                    f"Email: {member.get('email', 'N/A')}\n"
                    f"Office: {member.get('office', 'N/A')}"
                ),
                "source": "faculty.json",
                "metadata": {
                    "type": "faculty",
                    "department": dept_name,
                    "name": member.get('name', 'Unknown'),
                },
            })
    return documents
def process_events(data: dict) -> List[Dict]:
    """Build one document per event/workshop entry."""
    if not data or "events" not in data:
        return []

    documents = []
    for event in data["events"]:
        lines = [
            f"Event: {event.get('title', 'N/A')}",
            f"Date: {event.get('date', 'N/A')}",
            f"Description: {event.get('description', 'N/A')}",
            f"Organizer: {event.get('organizer', 'N/A')}",
            f"Venue: {event.get('venue', 'N/A')}",
        ]
        documents.append({
            "content": "\n".join(lines),
            "source": "events_workshops.json",
            "metadata": {"type": "event", "title": event.get('title', 'Unknown')},
        })
    return documents
def process_grading(data: dict) -> List[Dict]:
    """Build a single document describing the grading system."""
    if not data or "grading_system" not in data:
        return []

    grading = data["grading_system"]
    # Assemble the text in parts and join once instead of string +=.
    parts = [
        f"{grading.get('title', 'Grading System')}\n"
        f"{grading.get('description', '')}\n"
        "Grade Scale:\n"
    ]
    for grade in grading.get('grade_scale', []):
        parts.append(
            f"- {grade.get('letter_grade', '')}: {grade.get('numerical_score', '')}"
            f" (Grade Point: {grade.get('grade_point', '')})\n"
        )
    parts.append("\nSpecial Grades:\n")
    for special in grading.get('special_grades', []):
        parts.append(f"- {special.get('grade', '')}: {special.get('description', '')}\n")

    return [{
        "content": "".join(parts),
        "source": "grading.json",
        "metadata": {"type": "grading_system"},
    }]
# ============================================================================
# GENERIC PROCESSORS
# ============================================================================
def process_generic(data: dict, source_name: str, category: str) -> List[Dict]:
    """Fallback processor: flatten any JSON file into one labeled document."""
    if not data:
        return []
    title = source_name.replace('_', ' ').title()
    return [{
        "content": f"{title} Information:\n{flatten_dict(data)}",
        "source": f"{source_name}.json",
        "metadata": {"type": category, "source": source_name},
    }]
# ============================================================================
# MAIN FUNCTION
# ============================================================================
def main():
    """Entry point: fetch all GitHub JSON sources, chunk them into documents,
    embed them, and persist a FAISS index under ./data/faiss_index."""
    print("="*70)
    print("πŸ”¨ EWU RAG KNOWLEDGE BASE BUILDER")
    print("="*70)
    print(f"πŸ“Š Total files to process: {len(GITHUB_DATA_SOURCES)}")
    # Initialize vector store
    print("\nπŸ“¦ Initializing vector store...")
    vector_store = VectorStore(
        index_path="./data/faiss_index",
        embedding_model="sentence-transformers/all-MiniLM-L6-v2"
    )
    all_documents = []
    # Specific processors (for complex structured data); every other source
    # falls through to process_generic with a filename-prefix category.
    specific_processors = {
        "tuition_fees": process_tuition_fees,
        "admission_calendar": process_admission_calendar,
        "admission_process": process_admission_process,
        "admission_requirements": process_admission_requirements,
        "facilities": process_facilities,
        "faculty": process_faculty,
        "events_workshops": process_events,
        "grading": process_grading,
    }
    # Load and process each data source from GitHub
    print("\nπŸ“š Fetching data from GitHub repository...\n")
    success_count = 0
    fail_count = 0
    for source_name, filename in GITHUB_DATA_SOURCES.items():
        # Load data from GitHub (None on any fetch/parse failure)
        data = load_from_github(filename)
        if data:
            try:
                # Use specific processor if available, otherwise use generic
                if source_name in specific_processors:
                    docs = specific_processors[source_name](data)
                else:
                    # Determine category from the filename prefix.
                    # NOTE(review): graduate files are matched by a bare
                    # startswith("m"), so "tesol.json" falls into
                    # general_info — confirm that is intended.
                    if filename.startswith("static_"):
                        category = "static_info"
                    elif filename.startswith("dynamic_"):
                        category = "dynamic_info"
                    elif filename.startswith("st_"):
                        category = "undergraduate_program"
                    elif filename.startswith("m"):
                        category = "graduate_program"
                    else:
                        category = "general_info"
                    docs = process_generic(data, source_name, category)
                all_documents.extend(docs)
                success_count += 1
                print(f" βœ“ {source_name}: {len(docs)} document(s)")
            except Exception as e:
                # A malformed file must not abort the whole build; count it
                # as failed and continue with the remaining sources.
                fail_count += 1
                print(f" βœ— {source_name}: Error - {str(e)[:60]}")
        else:
            # load_from_github already printed the fetch error.
            fail_count += 1
    # Add documents to vector store
    if all_documents:
        print(f"\n{'='*70}")
        print(f"πŸ“¦ Adding {len(all_documents)} documents to vector store...")
        print(f"⏳ This may take 1-2 minutes...")
        vector_store.add_documents(all_documents)
        vector_store.save_index()
        print(f"βœ… Knowledge base successfully created!")
        print(f"{'='*70}")
        # Summary
        print(f"\nπŸ“Š Processing Summary:")
        print(f" βœ… Successfully processed: {success_count}/{len(GITHUB_DATA_SOURCES)} files")
        print(f" ❌ Failed: {fail_count}/{len(GITHUB_DATA_SOURCES)} files")
        print(f" πŸ“„ Total documents: {len(all_documents)}")
        print(f"\nπŸ“Š Document Type Breakdown:")
        # Tally documents by their metadata "type" for the summary report.
        type_counts = {}
        for doc in all_documents:
            doc_type = doc['metadata'].get('type', 'unknown')
            type_counts[doc_type] = type_counts.get(doc_type, 0) + 1
        for doc_type, count in sorted(type_counts.items()):
            print(f" β€’ {doc_type}: {count}")
        print(f"\nπŸ’Ύ Index saved to: ./data/faiss_index")
        print(f"πŸ“ Files:")
        print(f" β€’ index.faiss (vector index)")
        print(f" β€’ documents.json (document metadata)")
        print(f"\nπŸš€ Ready to start RAG server!")
        print(f" Command: python rag_server.py\n")
    else:
        print("\n⚠️ WARNING: No documents were processed!")
        print(" Check:")
        print(" 1. Network connection")
        print(" 2. GitHub repository URL")
        print(" 3. File names in GITHUB_DATA_SOURCES\n")
# Run the knowledge-base builder only when executed as a script, not on import.
if __name__ == "__main__":
    main()