import json
import os
import requests
from vector_store import VectorStore
from typing import List, Dict, Optional

# GitHub repository configuration: every data file is fetched raw from this repo.
BASE_URL = "https://raw.githubusercontent.com/Atkiya/RasaChatbot/main/"

# COMPLETE Data source files from your GitHub repository.
# Keys are logical source names (used by main() to select a processor);
# values are file names appended to BASE_URL.
GITHUB_DATA_SOURCES = {
    # ========================================
    # DYNAMIC FILES (Updated frequently)
    # ========================================
    "admission_calendar": "dynamic_admission_calendar.json",
    "admission_process": "dynamic_admission_process.json",
    "admission_requirements": "dynamic_admission_requirements.json",
    # NOTE(review): "tution" and "facilites" look like typos, but presumably
    # match the actual file names in the repository -- confirm before renaming.
    "tuition_fees": "dynamic_tution_fees.json",
    "events_workshops": "dynamic_events_workshops.json",
    "faculty": "dynamic_faculty.json",
    "grading": "dynamic_grading.json",
    "facilities": "dynamic_facilites.json",

    # ========================================
    # STATIC FILES (General info)
    # ========================================
    "about_ewu": "static_aboutEWU.json",
    "admin": "static_Admin.json",
    "all_programs": "static_AllAvailablePrograms.json",
    "campus_life": "static_campus_life.json",
    "career_counseling": "static_Career_Counseling_Center.json",
    "clubs": "static_clubs.json",
    "departments": "static_depts.json",
    "facilities_static": "static_facilities.json",
    "facilities17": "static_facilities17.json",
    "helpdesk": "static_helpdesk.json",
    "payment_procedure": "static_payment_procedure.json",
    "policy": "static_Policy.json",
    "programs": "static_Programs.json",
    "rules": "static_Rules.json",
    "scholarships": "static_scholarship_and_financial.json",
    "sexual_harassment": "static_Sexual_harassment.json",
    "tuition_fees_static": "static_Tuition_fees.json",

    # ========================================
    # GRADUATE PROGRAMS (Master's, PhD)
    # ========================================
    "ma_english": "ma_english.json",
    "mba_emba": "mba_emba.json",
    "mds": "mds.json",
    "mphil_pharmacy": "mphil_pharmacy.json",
    "mss_economics": "mss_eco.json",
    "ms_cse": "ms_cse.json",
    "ms_dsa": "ms_dsa.json",
    "tesol": "tesol.json",

    # ========================================
    # UNDERGRADUATE PROGRAMS (Bachelor's)
    # ========================================
    "st_ba": "st_ba.json",
    "st_ce": "st_ce.json",
    "st_cse": "st_cse.json",
    "st_ece": "st_ece.json",
    "st_economics": "st_economics.json",
    "st_eee": "st_eee.json",
    "st_english": "st_english.json",
    "st_geb": "st_geb.json",
    "st_information_studies": "st_information_studies.json",
    "st_law": "st_law.json",
    "st_math": "st_math.json",
    "st_pharmacy": "st_pharmacy.json",
    "st_social_relations": "st_social_relations.json",
    "st_sociology": "st_sociology.json",
}

# ============================================================================
# UTILITY FUNCTIONS
# ============================================================================

def load_from_github(filename: str) -> Optional[dict]:
    """Load JSON data from GitHub repository.

    Returns the parsed JSON dict on success, or None on a non-200 status or
    any network/parsing error (callers treat None as "skip this file").
    """
    try:
        url = BASE_URL + filename
        # BUG FIX: the original f-string contained no placeholder and printed
        # a literal "(unknown)" instead of the file actually being fetched.
        print(f" šŸ“„ Fetching: {filename}...", end=" ")
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            print("āœ…")
            return response.json()
        else:
            print(f"āŒ (Status: {response.status_code})")
            return None
    except Exception as e:
        # Best-effort fetch: report a truncated error message and move on;
        # one bad file must not abort the whole build.
        print(f"āŒ ({str(e)[:50]})")
        return None

def flatten_dict(d: dict, parent_key: str = '', sep: str = ' > ') -> str:
    """Recursively flatten a dictionary into readable text.

    Nested dict keys become "parent > child" paths; dict- and list-valued
    keys get an upper-cased heading, list items are numbered, and scalar
    values render as "key: value" lines.
    """
    items = []
    for k, v in d.items():
        new_key = f"{parent_key}{sep}{k}" if parent_key else k
        if isinstance(v, dict):
            items.append(f"\n{new_key.upper()}:")
            items.append(flatten_dict(v, new_key, sep))
        elif isinstance(v, list):
            items.append(f"\n{new_key.upper()}:")
            for i, item in enumerate(v, 1):
                if isinstance(item, dict):
                    items.append(f"\n [{i}]")
                    items.append(flatten_dict(item, '', sep))
                else:
                    items.append(f" - {item}")
        else:
            items.append(f"{new_key}: {v}")
    return "\n".join(items)

# ============================================================================
# SPECIFIC PROCESSORS
# ============================================================================

def process_tuition_fees(data: dict) -> List[Dict]:
    """Process tuition fees data into document chunks.

    Emits one document per program for per-credit fees and detailed fee
    structures (undergraduate and graduate), plus one per fee category.
    """
    documents = []
    if not data:
        return documents

    # Process undergraduate programs
    if "undergraduate_programs" in data:
        # Per credit fees
        if "tuition_fees_per_credit" in data["undergraduate_programs"]:
            for program in data["undergraduate_programs"]["tuition_fees_per_credit"]:
                doc = {
                    "content": f"""Program: {program['program']}
Fee Per Credit: {program['fee_per_credit']} BDT/credit
Program Type: Undergraduate
Applicable From: {data.get('page_info', {}).get('applicable_from', 'N/A')}""",
                    "source": "tuition_fees.json",
                    "metadata": {
                        "type": "tuition_per_credit",
                        "program": program['program'],
                        "level": "undergraduate"
                    }
                }
                documents.append(doc)

        # Detailed fee structure
        if "detailed_fee_structure" in data["undergraduate_programs"]:
            for program in data["undergraduate_programs"]["detailed_fee_structure"]:
                doc = {
                    "content": f"""Program: {program['program']}
Total Tuition Fee: {program['tuition_fees']} BDT
Total Credits: {program['credits']}
Grand Total Program Cost: {program['grand_total']} BDT
Program Level: Undergraduate""",
                    "source": "tuition_fees.json",
                    "metadata": {
                        "type": "tuition_detailed",
                        "program": program['program'],
                        "level": "undergraduate"
                    }
                }
                documents.append(doc)

    # Process graduate programs
    if "graduate_programs" in data:
        if "detailed_fee_structure" in data["graduate_programs"]:
            for program in data["graduate_programs"]["detailed_fee_structure"]:
                doc = {
                    "content": f"""Program: {program['program']}
Total Tuition Fee: {program['tuition_fees']} BDT
Total Credits: {program['credits']}
Grand Total Program Cost: {program['grand_total']} BDT
Program Level: Graduate""",
                    "source": "tuition_fees.json",
                    "metadata": {
                        "type": "tuition_detailed",
                        "program": program['program'],
                        "level": "graduate"
                    }
                }
                documents.append(doc)

    # Process fee categories (flat name -> value pairs)
    if "fee_categories" in data:
        for fee_name, fee_value in data["fee_categories"].items():
            doc = {
                "content": f"""{fee_name.replace('_', ' ').title()}: {fee_value}
Fee Type: {fee_name}""",
                "source": "tuition_fees.json",
                "metadata": {
                    "type": "fee_category",
                    "fee_name": fee_name
                }
            }
            documents.append(doc)

    return documents

def process_admission_calendar(data: dict) -> List[Dict]:
    """Process admission calendar/deadlines.

    Emits one document per program, for both undergraduate and graduate
    admission sections; the semester comes from the shared page_info block.
    """
    documents = []
    if not data:
        return documents

    # Undergraduate admission deadlines
    if "undergraduate_admission" in data:
        for program in data["undergraduate_admission"]:
            doc = {
                "content": f"""Program: {program['program']}
Application Deadline: {program['application_deadline']}
Admission Test Date: {program['admission_test']}
Semester: {data.get('page_info', {}).get('semester', 'N/A')}
Program Level: Undergraduate""",
                "source": "admission_calendar.json",
                "metadata": {
                    "type": "admission_deadline",
                    "program": program['program'],
                    "level": "undergraduate"
                }
            }
            documents.append(doc)

    # Graduate admission deadlines
    if "graduate_admission" in data:
        for program in data["graduate_admission"]:
            doc = {
                "content": f"""Program: {program['program']}
Application Deadline: {program['application_deadline']}
Admission Test Date: {program['admission_test']}
Semester: {data.get('page_info', {}).get('semester', 'N/A')}
Program Level: Graduate""",
                "source": "admission_calendar.json",
                "metadata": {
                    "type": "admission_deadline",
                    "program": program['program'],
                    "level": "graduate"
                }
            }
            documents.append(doc)

    return documents

def process_admission_process(data: dict) -> List[Dict]:
    """Process admission process/procedures as one flattened document."""
    documents = []
    if not data:
        return documents

    content = flatten_dict(data)
    doc = {
        "content": f"""Admission Process Information:
{content}""",
        "source": "admission_process.json",
        "metadata": {
            "type": "admission_process"
        }
    }
    documents.append(doc)
    return documents

def process_admission_requirements(data: dict) -> List[Dict]:
    """Process admission requirements.

    Handles three known sub-sections: general undergraduate programs,
    B.Pharm (which has its own stricter rules), and MBA/EMBA graduate
    requirements. Missing sub-keys render as 'N/A'.
    """
    documents = []
    if not data or "admission_requirements" not in data:
        return documents

    reqs = data["admission_requirements"]

    # Undergraduate requirements (general)
    if "undergraduate" in reqs and "general_programs_except_bpharm" in reqs["undergraduate"]:
        ug = reqs["undergraduate"]["general_programs_except_bpharm"]
        doc = {
            "content": f"""Undergraduate Admission Requirements (General Programs except B.Pharm):
Academic Requirements:
- SSC/HSC: {ug.get('ssc_hsc', 'N/A')}
- Diploma: {ug.get('diploma', 'N/A')}
- O/A Levels: {ug.get('o_a_levels_requirement', 'N/A')}
Admission Test Weightage:
- Admission Test: {ug.get('admission_test_weightage', {}).get('admission_test', 'N/A')}
- SSC/O Level: {ug.get('admission_test_weightage', {}).get('ssc_o_level', 'N/A')}
- HSC/A Level: {ug.get('admission_test_weightage', {}).get('hsc_a_level', 'N/A')}
Subject Requirements:
- CSE: {ug.get('subject_requirements', {}).get('cse', 'N/A')}""",
            "source": "admission_requirements.json",
            "metadata": {
                "type": "admission_requirements",
                "level": "undergraduate",
                "program_type": "general"
            }
        }
        documents.append(doc)

    # B.Pharm specific requirements
    if "undergraduate" in reqs and "b_pharm" in reqs["undergraduate"]:
        pharm = reqs["undergraduate"]["b_pharm"]
        doc = {
            "content": f"""B.Pharm (Bachelor of Pharmacy) Admission Requirements:
- Citizenship: {pharm.get('citizenship', 'N/A')}
- SSC+HSC Aggregate: {pharm.get('ssc_hsc_aggregate', 'N/A')}
- SSC+HSC Minimum Each: {pharm.get('ssc_hsc_minimum_each', 'N/A')}
- Year of Passing: {pharm.get('year_of_pass', 'N/A')}
Subject GPA Requirements:
- Chemistry: {pharm.get('subject_gpa', {}).get('chemistry', 'N/A')}
- Biology: {pharm.get('subject_gpa', {}).get('biology', 'N/A')}
- Physics: {pharm.get('subject_gpa', {}).get('physics', 'N/A')}
- Mathematics: {pharm.get('subject_gpa', {}).get('mathematics', 'N/A')}
Special Note: {pharm.get('special_note', '')}""",
            "source": "admission_requirements.json",
            "metadata": {
                "type": "admission_requirements",
                "level": "undergraduate",
                "program": "B.Pharm"
            }
        }
        documents.append(doc)

    # Graduate requirements (MBA/EMBA)
    if "graduate" in reqs and "mba_emba" in reqs["graduate"]:
        mba = reqs["graduate"]["mba_emba"]
        doc = {
            "content": f"""MBA/EMBA Admission Requirements:
- Degree: {mba.get('degree', 'N/A')}
- SSC+HSC+Graduate GPA: {mba.get('ssc_hsc_graduate_gpa', 'N/A')}
Work Experience:
- MBA: {mba.get('mba_work_experience', 'N/A')}
- EMBA: {mba.get('emba_work_experience', 'N/A')}
Test Exemptions:
- EWU Graduates: {mba.get('test_exemptions', {}).get('ewu_graduates', 'N/A')}
- Other Universities: {mba.get('test_exemptions', {}).get('other_universities', 'N/A')}""",
            "source": "admission_requirements.json",
            "metadata": {
                "type": "admission_requirements",
                "level": "graduate",
                "program": "MBA/EMBA"
            }
        }
        documents.append(doc)

    return documents

def process_facilities(data: dict) -> List[Dict]:
    """Process facilities information.

    Emits one document per campus-life facility and a single combined
    document for the engineering labs section.
    """
    documents = []
    if not data or "facilities" not in data:
        return documents

    facilities = data["facilities"]

    # Campus life facilities
    if "campus_life" in facilities and "available" in facilities["campus_life"]:
        for facility in facilities["campus_life"]["available"]:
            doc = {
                "content": f"""Facility: {facility['name']}
Description: {facility['description']}
Category: Campus Life""",
                "source": "facilities.json",
                "metadata": {
                    "type": "facility",
                    "facility_name": facility['name']
                }
            }
            documents.append(doc)

    # Engineering labs (all labs rolled into one document)
    if "engineering_labs" in facilities:
        labs_info = facilities["engineering_labs"]
        labs_content = f"""Engineering Laboratories at EWU
Departments: {', '.join(labs_info.get('departments', []))}
Available Labs:
"""
        for lab in labs_info.get('labs', []):
            labs_content += f"- {lab['name']}\n"
        doc = {
            "content": labs_content,
            "source": "facilities.json",
            "metadata": {
                "type": "facility",
                "category": "engineering_labs"
            }
        }
        documents.append(doc)

    return documents

def process_faculty(data: dict) -> List[Dict]:
    """Process faculty information.

    Walks each department under data["faculty"] and emits one document per
    faculty member; departments without a "members" list are skipped.
    """
    documents = []
    if not data or "faculty" not in data:
        return documents

    # Process each department's faculty
    for dept_key, dept_data in data["faculty"].items():
        if isinstance(dept_data, dict) and "members" in dept_data:
            # Fall back to the dict key when no display name is present.
            dept_name = dept_data.get("department_name", dept_key)
            for member in dept_data["members"]:
                doc = {
                    "content": f"""Faculty Member: {member.get('name', 'N/A')}
Department: {dept_name}
Designation: {member.get('designation', 'N/A')}
Specialization: {member.get('specialization', 'N/A')}
Email: {member.get('email', 'N/A')}
Office: {member.get('office', 'N/A')}""",
                    "source": "faculty.json",
                    "metadata": {
                        "type": "faculty",
                        "department": dept_name,
                        "name": member.get('name', 'Unknown')
                    }
                }
                documents.append(doc)

    return documents

def process_events(data: dict) -> List[Dict]:
    """Process events and workshops: one document per event entry."""
    documents = []
    if not data or "events" not in data:
        return documents

    for event in data["events"]:
        doc = {
            "content": f"""Event: {event.get('title', 'N/A')}
Date: {event.get('date', 'N/A')}
Description: {event.get('description', 'N/A')}
Organizer: {event.get('organizer', 'N/A')}
Venue: {event.get('venue', 'N/A')}""",
            "source": "events_workshops.json",
            "metadata": {
                "type": "event",
                "title": event.get('title', 'Unknown')
            }
        }
        documents.append(doc)

    return documents

def process_grading(data: dict) -> List[Dict]:
    """Process grading system information into a single combined document."""
    documents = []
    if not data or "grading_system" not in data:
        return documents

    grading = data["grading_system"]

    # Main grading system info: title + description, then the scales.
    content = f"""{grading.get('title', 'Grading System')}
{grading.get('description', '')}

Grade Scale:
"""
    for grade in grading.get('grade_scale', []):
        content += f"- {grade.get('letter_grade', '')}: {grade.get('numerical_score', '')} (Grade Point: {grade.get('grade_point', '')})\n"

    content += "\nSpecial Grades:\n"
    for spec_grade in grading.get('special_grades', []):
        content += f"- {spec_grade.get('grade', '')}: {spec_grade.get('description', '')}\n"

    doc = {
        "content": content,
        "source": "grading.json",
        "metadata": {
            "type": "grading_system"
        }
    }
    documents.append(doc)
    return documents

# ============================================================================
# GENERIC PROCESSORS
# ============================================================================

def process_generic(data: dict, source_name: str, category: str) -> List[Dict]:
    """Universal generic processor for any JSON file.

    Flattens the whole payload into one readable text document tagged with
    the given category; used for every source without a specific processor.
    """
    documents = []
    if not data:
        return documents

    content = flatten_dict(data)
    doc = {
        "content": f"""{source_name.replace('_', ' ').title()} Information:
{content}""",
        "source": f"{source_name}.json",
        "metadata": {
            "type": category,
            "source": source_name
        }
    }
    documents.append(doc)
    return documents

def categorize_filename(filename: str) -> str:
    """Map a data-source file name to a metadata category by prefix.

    Extracted from main() so the prefix rules are testable in isolation.
    Check order matters: "static_"/"dynamic_" first, then undergraduate
    "st_", then the broad "m" prefix used by all graduate-program files,
    and finally a catch-all.
    """
    if filename.startswith("static_"):
        return "static_info"
    if filename.startswith("dynamic_"):
        return "dynamic_info"
    if filename.startswith("st_"):
        return "undergraduate_program"
    if filename.startswith("m"):
        return "graduate_program"
    return "general_info"

# ============================================================================
# MAIN FUNCTION
# ============================================================================

def main():
    """Fetch every configured data file, convert it to document chunks, and
    build and save the FAISS-backed knowledge base."""
    print("="*70)
    print("šŸ”Ø EWU RAG KNOWLEDGE BASE BUILDER")
    print("="*70)
    print(f"šŸ“Š Total files to process: {len(GITHUB_DATA_SOURCES)}")

    # Initialize vector store
    print("\nšŸ“¦ Initializing vector store...")
    vector_store = VectorStore(
        index_path="./data/faiss_index",
        embedding_model="sentence-transformers/all-MiniLM-L6-v2"
    )

    all_documents = []

    # Specific processors (for complex structured data); any source not
    # listed here falls through to process_generic().
    specific_processors = {
        "tuition_fees": process_tuition_fees,
        "admission_calendar": process_admission_calendar,
        "admission_process": process_admission_process,
        "admission_requirements": process_admission_requirements,
        "facilities": process_facilities,
        "faculty": process_faculty,
        "events_workshops": process_events,
        "grading": process_grading,
    }

    # Load and process each data source from GitHub
    print("\nšŸ“š Fetching data from GitHub repository...\n")
    success_count = 0
    fail_count = 0

    for source_name, filename in GITHUB_DATA_SOURCES.items():
        # Load data from GitHub; None means fetch failed (already reported).
        data = load_from_github(filename)
        if data:
            try:
                # Use specific processor if available, otherwise use generic
                if source_name in specific_processors:
                    docs = specific_processors[source_name](data)
                else:
                    docs = process_generic(data, source_name, categorize_filename(filename))
                all_documents.extend(docs)
                success_count += 1
                print(f" āœ“ {source_name}: {len(docs)} document(s)")
            except Exception as e:
                # A malformed payload must not abort the whole build.
                fail_count += 1
                print(f" āœ— {source_name}: Error - {str(e)[:60]}")
        else:
            fail_count += 1

    # Add documents to vector store
    if all_documents:
        print(f"\n{'='*70}")
        print(f"šŸ“¦ Adding {len(all_documents)} documents to vector store...")
        print(f"ā³ This may take 1-2 minutes...")
        vector_store.add_documents(all_documents)
        vector_store.save_index()
        print(f"āœ… Knowledge base successfully created!")
        print(f"{'='*70}")

        # Summary
        print(f"\nšŸ“Š Processing Summary:")
        print(f" āœ… Successfully processed: {success_count}/{len(GITHUB_DATA_SOURCES)} files")
        print(f" āŒ Failed: {fail_count}/{len(GITHUB_DATA_SOURCES)} files")
        print(f" šŸ“„ Total documents: {len(all_documents)}")

        print(f"\nšŸ“Š Document Type Breakdown:")
        type_counts = {}
        for doc in all_documents:
            doc_type = doc['metadata'].get('type', 'unknown')
            type_counts[doc_type] = type_counts.get(doc_type, 0) + 1
        for doc_type, count in sorted(type_counts.items()):
            print(f" • {doc_type}: {count}")

        print(f"\nšŸ’¾ Index saved to: ./data/faiss_index")
        print(f"šŸ“ Files:")
        print(f" • index.faiss (vector index)")
        print(f" • documents.json (document metadata)")
        print(f"\nšŸš€ Ready to start RAG server!")
        print(f" Command: python rag_server.py\n")
    else:
        print("\nāš ļø WARNING: No documents were processed!")
        print(" Check:")
        print(" 1. Network connection")
        print(" 2. GitHub repository URL")
        print(" 3. File names in GITHUB_DATA_SOURCES\n")

if __name__ == "__main__":
    main()