# Spaces: Running  (status header captured from the hosting web page; not part of the program)
import json
import os
from typing import Dict, List, Optional

import requests

from vector_store import VectorStore
# Base address every data file name is appended to when it is downloaded.
BASE_URL = "https://raw.githubusercontent.com/Atkiya/RasaChatbot/main/"

# Files carrying the "dynamic_" prefix (described upstream as updated frequently).
# NOTE(review): "tution" and "facilites" are spelled exactly as in the original
# listing; presumably they mirror the file names in the repository - confirm
# before "correcting" them.
_DYNAMIC_SOURCES = {
    "admission_calendar": "dynamic_admission_calendar.json",
    "admission_process": "dynamic_admission_process.json",
    "admission_requirements": "dynamic_admission_requirements.json",
    "tuition_fees": "dynamic_tution_fees.json",
    "events_workshops": "dynamic_events_workshops.json",
    "faculty": "dynamic_faculty.json",
    "grading": "dynamic_grading.json",
    "facilities": "dynamic_facilites.json",
}

# Files carrying the "static_" prefix (general information).
_STATIC_SOURCES = {
    "about_ewu": "static_aboutEWU.json",
    "admin": "static_Admin.json",
    "all_programs": "static_AllAvailablePrograms.json",
    "campus_life": "static_campus_life.json",
    "career_counseling": "static_Career_Counseling_Center.json",
    "clubs": "static_clubs.json",
    "departments": "static_depts.json",
    "facilities_static": "static_facilities.json",
    "facilities17": "static_facilities17.json",
    "helpdesk": "static_helpdesk.json",
    "payment_procedure": "static_payment_procedure.json",
    "policy": "static_Policy.json",
    "programs": "static_Programs.json",
    "rules": "static_Rules.json",
    "scholarships": "static_scholarship_and_financial.json",
    "sexual_harassment": "static_Sexual_harassment.json",
    "tuition_fees_static": "static_Tuition_fees.json",
}

# Graduate programme files (Master's, PhD).
_GRADUATE_SOURCES = {
    "ma_english": "ma_english.json",
    "mba_emba": "mba_emba.json",
    "mds": "mds.json",
    "mphil_pharmacy": "mphil_pharmacy.json",
    "mss_economics": "mss_eco.json",
    "ms_cse": "ms_cse.json",
    "ms_dsa": "ms_dsa.json",
    "tesol": "tesol.json",
}

# Undergraduate programme files (Bachelor's).
_UNDERGRADUATE_SOURCES = {
    "st_ba": "st_ba.json",
    "st_ce": "st_ce.json",
    "st_cse": "st_cse.json",
    "st_ece": "st_ece.json",
    "st_economics": "st_economics.json",
    "st_eee": "st_eee.json",
    "st_english": "st_english.json",
    "st_geb": "st_geb.json",
    "st_information_studies": "st_information_studies.json",
    "st_law": "st_law.json",
    "st_math": "st_math.json",
    "st_pharmacy": "st_pharmacy.json",
    "st_social_relations": "st_social_relations.json",
    "st_sociology": "st_sociology.json",
}

# Complete mapping of source name -> file name, in the same order as the
# sections above (dynamic, static, graduate, undergraduate).
GITHUB_DATA_SOURCES = {
    **_DYNAMIC_SOURCES,
    **_STATIC_SOURCES,
    **_GRADUATE_SOURCES,
    **_UNDERGRADUATE_SOURCES,
}
# ============================================================================
# UTILITY FUNCTIONS
# ============================================================================
def load_from_github(filename: str) -> Optional[dict]:
    """Fetch one JSON data file from the GitHub repository.

    Args:
        filename: File name appended to ``BASE_URL`` to form the request URL.

    Returns:
        The decoded JSON payload when the server answers with HTTP 200 and the
        body parses as JSON; ``None`` when the status is anything else, the
        request fails, or the body is not valid JSON. A one-line progress
        report is written to stdout in every case.
    """
    url = BASE_URL + filename
    # Flush explicitly: the line is left open with end=" ", so without it the
    # "Fetching" text would only show up once the request has already finished.
    print(f" 📥 Fetching: {filename}...", end=" ", flush=True)
    try:
        response = requests.get(url, timeout=10)
        if response.status_code != 200:
            print(f"❌ (Status: {response.status_code})")
            return None
        # Decode before reporting success so a malformed body is not
        # announced as a successful download.
        payload = response.json()
    except (requests.RequestException, ValueError) as e:
        # RequestException covers connection/timeout problems; ValueError
        # covers JSON decoding errors raised by response.json().
        print(f"❌ ({str(e)[:50]})")
        return None
    print("✅")
    return payload
def flatten_dict(d: dict, parent_key: str = '', sep: str = ' > ') -> str:
    """Render a nested dictionary as newline-separated, human-readable text.

    Args:
        d: Mapping to render.
        parent_key: Path of the enclosing keys, prepended to each key of ``d``.
        sep: Separator placed between path components.

    Returns:
        Scalars as ``path: value`` lines; nested dicts and lists introduced by
        an upper-cased ``PATH:`` heading. Dict items inside a list are numbered
        from 1 and rendered without any parent path; other list items are
        rendered as ``- item`` bullets.
    """
    lines = []
    for key, value in d.items():
        path = key if not parent_key else f"{parent_key}{sep}{key}"

        # Plain scalar: a single "path: value" line.
        if not isinstance(value, (dict, list)):
            lines.append(f"{path}: {value}")
            continue

        # Containers share the same upper-cased heading.
        lines.append(f"\n{path.upper()}:")

        if isinstance(value, dict):
            lines.append(flatten_dict(value, path, sep))
            continue

        for position, element in enumerate(value, start=1):
            if isinstance(element, dict):
                lines.append(f"\n [{position}]")
                # List members restart the key path from scratch.
                lines.append(flatten_dict(element, '', sep))
            else:
                lines.append(f" - {element}")
    return "\n".join(lines)
# ============================================================================
# SPECIFIC PROCESSORS
# ============================================================================
def process_tuition_fees(data: dict) -> List[Dict]:
    """Turn the tuition-fee payload into one document per fee entry.

    Documents are emitted in this order: undergraduate per-credit fees,
    undergraduate detailed fee structures, graduate detailed fee structures,
    then one document per entry of ``fee_categories``. Sections missing from
    ``data`` are skipped.

    Args:
        data: Parsed tuition-fee JSON; a falsy value yields an empty list.

    Returns:
        A list of dicts with ``content``, ``source`` and ``metadata`` keys.

    Raises:
        KeyError: If a listed programme entry lacks one of the fields that is
            read with direct indexing (e.g. ``program``).
    """
    source = "tuition_fees.json"
    chunks = []
    if not data:
        return chunks

    def detailed_chunk(entry, level_label, level_tag):
        # Shared layout for the undergraduate and graduate fee structures.
        return {
            "content": (
                f"Program: {entry['program']}\n"
                f"Total Tuition Fee: {entry['tuition_fees']} BDT\n"
                f"Total Credits: {entry['credits']}\n"
                f"Grand Total Program Cost: {entry['grand_total']} BDT\n"
                f"Program Level: {level_label}"
            ),
            "source": source,
            "metadata": {
                "type": "tuition_detailed",
                "program": entry['program'],
                "level": level_tag,
            },
        }

    if "undergraduate_programs" in data:
        undergrad = data["undergraduate_programs"]

        if "tuition_fees_per_credit" in undergrad:
            for entry in undergrad["tuition_fees_per_credit"]:
                chunks.append({
                    "content": (
                        f"Program: {entry['program']}\n"
                        f"Fee Per Credit: {entry['fee_per_credit']} BDT/credit\n"
                        "Program Type: Undergraduate\n"
                        f"Applicable From: {data.get('page_info', {}).get('applicable_from', 'N/A')}"
                    ),
                    "source": source,
                    "metadata": {
                        "type": "tuition_per_credit",
                        "program": entry['program'],
                        "level": "undergraduate",
                    },
                })

        if "detailed_fee_structure" in undergrad:
            for entry in undergrad["detailed_fee_structure"]:
                chunks.append(detailed_chunk(entry, "Undergraduate", "undergraduate"))

    if "graduate_programs" in data:
        graduate = data["graduate_programs"]
        if "detailed_fee_structure" in graduate:
            for entry in graduate["detailed_fee_structure"]:
                chunks.append(detailed_chunk(entry, "Graduate", "graduate"))

    if "fee_categories" in data:
        for fee_name, fee_value in data["fee_categories"].items():
            chunks.append({
                "content": (
                    f"{fee_name.replace('_', ' ').title()}: {fee_value}\n"
                    f"Fee Type: {fee_name}"
                ),
                "source": source,
                "metadata": {"type": "fee_category", "fee_name": fee_name},
            })

    return chunks
def process_admission_calendar(data: dict) -> List[Dict]:
    """Build one deadline document per programme in the admission calendar.

    Entries under ``undergraduate_admission`` are emitted first, followed by
    those under ``graduate_admission``; a missing section is skipped.

    Args:
        data: Parsed admission-calendar JSON; a falsy value yields an empty list.

    Returns:
        A list of dicts with ``content``, ``source`` and ``metadata`` keys.

    Raises:
        KeyError: If an entry lacks ``program``, ``application_deadline`` or
            ``admission_test``.
    """
    deadlines = []
    if not data:
        return deadlines

    # (section key in the payload, label used in the text, metadata level)
    sections = (
        ("undergraduate_admission", "Undergraduate", "undergraduate"),
        ("graduate_admission", "Graduate", "graduate"),
    )
    for section_key, level_label, level_tag in sections:
        if section_key not in data:
            continue
        for entry in data[section_key]:
            text = (
                f"Program: {entry['program']}\n"
                f"Application Deadline: {entry['application_deadline']}\n"
                f"Admission Test Date: {entry['admission_test']}\n"
                f"Semester: {data.get('page_info', {}).get('semester', 'N/A')}\n"
                f"Program Level: {level_label}"
            )
            deadlines.append({
                "content": text,
                "source": "admission_calendar.json",
                "metadata": {
                    "type": "admission_deadline",
                    "program": entry['program'],
                    "level": level_tag,
                },
            })
    return deadlines
def process_admission_process(data: dict) -> List[Dict]:
    """Wrap the whole admission-process payload in a single document.

    Args:
        data: Parsed admission-process JSON; a falsy value yields an empty list.

    Returns:
        An empty list, or a one-element list whose ``content`` is the payload
        flattened to text under an "Admission Process Information:" heading.
    """
    if not data:
        return []

    flattened = flatten_dict(data)
    return [
        {
            "content": "Admission Process Information:\n" + flattened,
            "source": "admission_process.json",
            "metadata": {"type": "admission_process"},
        }
    ]
def process_admission_requirements(data: dict) -> List[Dict]:
    """Build documents describing the admission requirements.

    Up to three documents are produced, in this order: general undergraduate
    programmes, B.Pharm, and MBA/EMBA. Each one is emitted only when its
    section exists under ``data["admission_requirements"]``; individual fields
    missing from a section are rendered as ``N/A`` (the B.Pharm special note
    falls back to an empty string).

    Args:
        data: Parsed admission-requirements JSON.

    Returns:
        A list of dicts with ``content``, ``source`` and ``metadata`` keys;
        empty when ``data`` is falsy or has no ``admission_requirements`` key.
    """
    source = "admission_requirements.json"
    results = []
    if not data or "admission_requirements" not in data:
        return results

    reqs = data["admission_requirements"]
    has_undergrad = "undergraduate" in reqs

    # General undergraduate programmes (everything except B.Pharm).
    if has_undergrad and "general_programs_except_bpharm" in reqs["undergraduate"]:
        general = reqs["undergraduate"]["general_programs_except_bpharm"]
        weightage = general.get('admission_test_weightage', {})
        subjects = general.get('subject_requirements', {})
        results.append({
            "content": (
                "Undergraduate Admission Requirements (General Programs except B.Pharm):\n"
                "Academic Requirements:\n"
                f"- SSC/HSC: {general.get('ssc_hsc', 'N/A')}\n"
                f"- Diploma: {general.get('diploma', 'N/A')}\n"
                f"- O/A Levels: {general.get('o_a_levels_requirement', 'N/A')}\n"
                "Admission Test Weightage:\n"
                f"- Admission Test: {weightage.get('admission_test', 'N/A')}\n"
                f"- SSC/O Level: {weightage.get('ssc_o_level', 'N/A')}\n"
                f"- HSC/A Level: {weightage.get('hsc_a_level', 'N/A')}\n"
                "Subject Requirements:\n"
                f"- CSE: {subjects.get('cse', 'N/A')}"
            ),
            "source": source,
            "metadata": {
                "type": "admission_requirements",
                "level": "undergraduate",
                "program_type": "general",
            },
        })

    # B.Pharm has its own requirement set.
    if has_undergrad and "b_pharm" in reqs["undergraduate"]:
        pharmacy = reqs["undergraduate"]["b_pharm"]
        subject_gpa = pharmacy.get('subject_gpa', {})
        results.append({
            "content": (
                "B.Pharm (Bachelor of Pharmacy) Admission Requirements:\n"
                f"- Citizenship: {pharmacy.get('citizenship', 'N/A')}\n"
                f"- SSC+HSC Aggregate: {pharmacy.get('ssc_hsc_aggregate', 'N/A')}\n"
                f"- SSC+HSC Minimum Each: {pharmacy.get('ssc_hsc_minimum_each', 'N/A')}\n"
                f"- Year of Passing: {pharmacy.get('year_of_pass', 'N/A')}\n"
                "Subject GPA Requirements:\n"
                f"- Chemistry: {subject_gpa.get('chemistry', 'N/A')}\n"
                f"- Biology: {subject_gpa.get('biology', 'N/A')}\n"
                f"- Physics: {subject_gpa.get('physics', 'N/A')}\n"
                f"- Mathematics: {subject_gpa.get('mathematics', 'N/A')}\n"
                f"Special Note: {pharmacy.get('special_note', '')}"
            ),
            "source": source,
            "metadata": {
                "type": "admission_requirements",
                "level": "undergraduate",
                "program": "B.Pharm",
            },
        })

    # Graduate level: MBA / EMBA.
    if "graduate" in reqs and "mba_emba" in reqs["graduate"]:
        business = reqs["graduate"]["mba_emba"]
        exemptions = business.get('test_exemptions', {})
        results.append({
            "content": (
                "MBA/EMBA Admission Requirements:\n"
                f"- Degree: {business.get('degree', 'N/A')}\n"
                f"- SSC+HSC+Graduate GPA: {business.get('ssc_hsc_graduate_gpa', 'N/A')}\n"
                "Work Experience:\n"
                f"- MBA: {business.get('mba_work_experience', 'N/A')}\n"
                f"- EMBA: {business.get('emba_work_experience', 'N/A')}\n"
                "Test Exemptions:\n"
                f"- EWU Graduates: {exemptions.get('ewu_graduates', 'N/A')}\n"
                f"- Other Universities: {exemptions.get('other_universities', 'N/A')}"
            ),
            "source": source,
            "metadata": {
                "type": "admission_requirements",
                "level": "graduate",
                "program": "MBA/EMBA",
            },
        })

    return results
def process_facilities(data: dict) -> List[Dict]:
    """Build documents for campus facilities and engineering labs.

    One document is produced per entry of
    ``facilities.campus_life.available``, followed by a single summary
    document for ``facilities.engineering_labs`` when that section exists.

    Args:
        data: Parsed facilities JSON.

    Returns:
        A list of dicts with ``content``, ``source`` and ``metadata`` keys;
        empty when ``data`` is falsy or has no ``facilities`` key.

    Raises:
        KeyError: If a campus-life entry lacks ``name`` or ``description``, or
            a lab entry lacks ``name``.
    """
    source = "facilities.json"
    results = []
    if not data or "facilities" not in data:
        return results

    facilities = data["facilities"]

    # One document per campus-life facility.
    if "campus_life" in facilities and "available" in facilities["campus_life"]:
        for item in facilities["campus_life"]["available"]:
            results.append({
                "content": (
                    f"Facility: {item['name']}\n"
                    f"Description: {item['description']}\n"
                    "Category: Campus Life"
                ),
                "source": source,
                "metadata": {"type": "facility", "facility_name": item['name']},
            })

    # A single document listing every engineering lab.
    if "engineering_labs" in facilities:
        labs_section = facilities["engineering_labs"]
        heading = (
            "Engineering Laboratories at EWU\n"
            f"Departments: {', '.join(labs_section.get('departments', []))}\n"
            "Available Labs:\n"
        )
        bullets = "".join(f"- {lab['name']}\n" for lab in labs_section.get('labs', []))
        results.append({
            "content": heading + bullets,
            "source": source,
            "metadata": {"type": "facility", "category": "engineering_labs"},
        })

    return results
def process_faculty(data: dict) -> List[Dict]:
    """Build one document per faculty member.

    Only values of ``data["faculty"]`` that are dicts containing a ``members``
    key are processed. The department label is the entry's
    ``department_name`` when present, otherwise the entry's key. Missing
    member fields are rendered as ``N/A`` (``Unknown`` for the metadata name).

    Args:
        data: Parsed faculty JSON.

    Returns:
        A list of dicts with ``content``, ``source`` and ``metadata`` keys;
        empty when ``data`` is falsy or has no ``faculty`` key.
    """
    profiles = []
    if not data or "faculty" not in data:
        return profiles

    for dept_key, dept_entry in data["faculty"].items():
        # Skip anything that is not a department record with a member list.
        if not (isinstance(dept_entry, dict) and "members" in dept_entry):
            continue

        department = dept_entry.get("department_name", dept_key)
        for person in dept_entry["members"]:
            text = (
                f"Faculty Member: {person.get('name', 'N/A')}\n"
                f"Department: {department}\n"
                f"Designation: {person.get('designation', 'N/A')}\n"
                f"Specialization: {person.get('specialization', 'N/A')}\n"
                f"Email: {person.get('email', 'N/A')}\n"
                f"Office: {person.get('office', 'N/A')}"
            )
            profiles.append({
                "content": text,
                "source": "faculty.json",
                "metadata": {
                    "type": "faculty",
                    "department": department,
                    "name": person.get('name', 'Unknown'),
                },
            })
    return profiles
def process_events(data: dict) -> List[Dict]:
    """Build one document per entry of ``data["events"]``.

    Missing event fields are rendered as ``N/A`` in the text; a missing title
    becomes ``Unknown`` in the metadata.

    Args:
        data: Parsed events/workshops JSON.

    Returns:
        A list of dicts with ``content``, ``source`` and ``metadata`` keys;
        empty when ``data`` is falsy or has no ``events`` key.
    """
    if not data or "events" not in data:
        return []

    return [
        {
            "content": (
                f"Event: {entry.get('title', 'N/A')}\n"
                f"Date: {entry.get('date', 'N/A')}\n"
                f"Description: {entry.get('description', 'N/A')}\n"
                f"Organizer: {entry.get('organizer', 'N/A')}\n"
                f"Venue: {entry.get('venue', 'N/A')}"
            ),
            "source": "events_workshops.json",
            "metadata": {"type": "event", "title": entry.get('title', 'Unknown')},
        }
        for entry in data["events"]
    ]
def process_grading(data: dict) -> List[Dict]:
    """Summarise the grading system in a single document.

    The text consists of the title and description, a "Grade Scale:" list
    built from ``grade_scale`` and a "Special Grades:" list built from
    ``special_grades``. Missing fields fall back to an empty string (the title
    falls back to "Grading System").

    Args:
        data: Parsed grading JSON.

    Returns:
        A one-element list with ``content``, ``source`` and ``metadata`` keys;
        empty when ``data`` is falsy or has no ``grading_system`` key.
    """
    if not data or "grading_system" not in data:
        return []

    grading = data["grading_system"]

    # Collect the text fragments in order and join them once at the end.
    fragments = [
        f"{grading.get('title', 'Grading System')}\n"
        f"{grading.get('description', '')}\n"
        "Grade Scale:\n"
    ]
    fragments.extend(
        f"- {row.get('letter_grade', '')}: {row.get('numerical_score', '')} "
        f"(Grade Point: {row.get('grade_point', '')})\n"
        for row in grading.get('grade_scale', [])
    )
    fragments.append("\nSpecial Grades:\n")
    fragments.extend(
        f"- {special.get('grade', '')}: {special.get('description', '')}\n"
        for special in grading.get('special_grades', [])
    )

    return [
        {
            "content": "".join(fragments),
            "source": "grading.json",
            "metadata": {"type": "grading_system"},
        }
    ]
# ============================================================================
# GENERIC PROCESSORS
# ============================================================================
def process_generic(data: dict, source_name: str, category: str) -> List[Dict]:
    """Wrap an arbitrary JSON payload in a single flattened-text document.

    Args:
        data: Parsed JSON payload; a falsy value yields an empty list.
        source_name: Logical source name. It is title-cased (underscores
            become spaces) for the heading and suffixed with ``.json`` for the
            ``source`` field.
        category: Value stored under ``metadata["type"]``.

    Returns:
        An empty list, or a one-element list with ``content``, ``source`` and
        ``metadata`` keys.
    """
    if not data:
        return []

    heading = source_name.replace('_', ' ').title()
    body = flatten_dict(data)
    return [
        {
            "content": f"{heading} Information:\n{body}",
            "source": f"{source_name}.json",
            "metadata": {"type": category, "source": source_name},
        }
    ]
# ============================================================================
# MAIN FUNCTION
# ============================================================================
# File-name prefixes treated as graduate-programme data. "tesol.json" is listed
# with the graduate programmes in GITHUB_DATA_SOURCES but does not start with
# "m", so it needs its own entry.
_GRADUATE_FILE_PREFIXES = ("m", "tesol")


def _categorize_source(filename: str) -> str:
    """Return the metadata category for a file handled by the generic processor."""
    if filename.startswith("static_"):
        return "static_info"
    if filename.startswith("dynamic_"):
        return "dynamic_info"
    if filename.startswith("st_"):
        return "undergraduate_program"
    if filename.startswith(_GRADUATE_FILE_PREFIXES):
        return "graduate_program"
    return "general_info"


def main():
    """Download every configured data file and build the vector index.

    For each entry of ``GITHUB_DATA_SOURCES`` the file is fetched with
    ``load_from_github`` and converted to documents, using the dedicated
    processor when one is registered for the source name and
    ``process_generic`` otherwise. A source counts as failed when nothing
    could be loaded for it or when its processor raises. If at least one
    document was produced, the documents are added to the vector store, the
    index is saved and a summary is printed; otherwise a troubleshooting hint
    is printed.
    """
    index_path = "./data/faiss_index"
    total_files = len(GITHUB_DATA_SOURCES)

    print("=" * 70)
    print("🔨 EWU RAG KNOWLEDGE BASE BUILDER")
    print("=" * 70)
    print(f"📊 Total files to process: {total_files}")

    # Initialize vector store
    print("\n📦 Initializing vector store...")
    vector_store = VectorStore(
        index_path=index_path,
        embedding_model="sentence-transformers/all-MiniLM-L6-v2",
    )

    all_documents = []

    # Specific processors (for complex structured data); every other source
    # goes through process_generic.
    specific_processors = {
        "tuition_fees": process_tuition_fees,
        "admission_calendar": process_admission_calendar,
        "admission_process": process_admission_process,
        "admission_requirements": process_admission_requirements,
        "facilities": process_facilities,
        "faculty": process_faculty,
        "events_workshops": process_events,
        "grading": process_grading,
    }

    # Load and process each data source from GitHub
    print("\n🌐 Fetching data from GitHub repository...\n")
    success_count = 0
    fail_count = 0

    for source_name, filename in GITHUB_DATA_SOURCES.items():
        data = load_from_github(filename)
        if not data:
            # Download failed (already reported) or the file was empty.
            fail_count += 1
            continue

        try:
            processor = specific_processors.get(source_name)
            if processor is not None:
                docs = processor(data)
            else:
                docs = process_generic(data, source_name, _categorize_source(filename))
            all_documents.extend(docs)
            success_count += 1
            print(f" ✓ {source_name}: {len(docs)} document(s)")
        except Exception as e:
            # Best effort: one malformed file must not abort the whole build.
            fail_count += 1
            print(f" ✗ {source_name}: Error - {str(e)[:60]}")

    if not all_documents:
        print("\n⚠️ WARNING: No documents were processed!")
        print(" Check:")
        print(" 1. Network connection")
        print(" 2. GitHub repository URL")
        print(" 3. File names in GITHUB_DATA_SOURCES\n")
        return

    # Add documents to vector store
    print(f"\n{'=' * 70}")
    print(f"📦 Adding {len(all_documents)} documents to vector store...")
    print("⏳ This may take 1-2 minutes...")
    vector_store.add_documents(all_documents)
    vector_store.save_index()
    print("✅ Knowledge base successfully created!")
    print(f"{'=' * 70}")

    # Summary
    print("\n📊 Processing Summary:")
    print(f" ✅ Successfully processed: {success_count}/{total_files} files")
    print(f" ❌ Failed: {fail_count}/{total_files} files")
    print(f" 📄 Total documents: {len(all_documents)}")

    print("\n📋 Document Type Breakdown:")
    type_counts = {}
    for doc in all_documents:
        doc_type = doc['metadata'].get('type', 'unknown')
        type_counts[doc_type] = type_counts.get(doc_type, 0) + 1
    for doc_type, count in sorted(type_counts.items()):
        print(f" • {doc_type}: {count}")

    print(f"\n💾 Index saved to: {index_path}")
    print("📁 Files:")
    print(" • index.faiss (vector index)")
    print(" • documents.json (document metadata)")
    print("\n🚀 Ready to start RAG server!")
    print(" Command: python rag_server.py\n")


if __name__ == "__main__":
    main()