yyzsna commited on
Commit
102c695
·
verified ·
1 Parent(s): 45c9cd9

Upload folder using huggingface_hub

Browse files
.gitignore ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ build/
8
+ develop-eggs/
9
+ dist/
10
+ downloads/
11
+ eggs/
12
+ .eggs/
13
+ lib/
14
+ lib64/
15
+ parts/
16
+ sdist/
17
+ var/
18
+ wheels/
19
+ pip-wheel-metadata/
20
+ share/python-wheels/
21
+ *.egg-info/
22
+ .installed.cfg
23
+ *.egg
24
+ MANIFEST
25
+ tabs/__pycache__/
26
+ .gradio
27
+
28
+ # Virtual Environment
29
+ .venv/
30
+ .env/
31
+ venv/
32
+ ENV/
33
+ env/
34
+ .venv
35
+ myenv/
36
+ gradio/
37
+
38
+ # Environment Variables
39
+ .env
40
+ .env.local
41
+ .env.development.local
42
+ .env.test.local
43
+ .env.production.local
44
+
45
+ # IDE
46
+ .vscode/
47
+ .idea/
48
+ *.swp
49
+ *.swo
50
+ *~
51
+
52
+ # macOS
53
+ .DS_Store
54
+ .AppleDouble
55
+ .LSOverride
56
+
57
+ # Windows
58
+ Thumbs.db
59
+ ehthumbs.db
60
+ Desktop.ini
61
+
62
+ # Jupyter Notebooks
63
+ .ipynb_checkpoints
64
+
65
+ # AI/ML specific
66
+ chroma_db/
67
+ chromadb/
68
+ *.db
69
+ *.sqlite
70
+ *.sqlite3
71
+
72
+ # Document storage
73
+ documents/
74
+ uploaded_documents/
75
+ temp_documents/
76
+
77
+ # Query results and cache
78
+ query_results/
79
+ .cache/
80
+ .streamlit/
81
+
82
+ # Model downloads and cache
83
+ models/
84
+ .transformers_cache/
85
+ .huggingface/
86
+ sentence_transformers_cache/
87
+
88
+ # Logs
89
+ *.log
90
+ logs/
91
+ .logs/
92
+
93
+ # Temporary files
94
+ tmp/
95
+ temp/
96
+ .tmp/
97
+
98
+ # Coverage reports
99
+ htmlcov/
100
+ .tox/
101
+ .coverage
102
+ .coverage.*
103
+ .cache
104
+ nosetests.xml
105
+ coverage.xml
106
+ *.cover
107
+ .hypothesis/
108
+ .pytest_cache/
109
+
110
+ # mypy
111
+ .mypy_cache/
112
+ .dmypy.json
113
+ dmypy.json
README.md CHANGED
@@ -1,12 +1,15 @@
1
  ---
2
- title: TopEdu Demo
3
- emoji: 📉
4
- colorFrom: indigo
5
- colorTo: gray
6
  sdk: gradio
7
- sdk_version: 5.42.0
8
- app_file: app.py
 
9
  pinned: false
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
1
  ---
2
+ title: TopEdu_Demo
3
+ emoji: 🎓
4
+ colorFrom: blue
5
+ colorTo: green
6
  sdk: gradio
7
+ python_version: 3.11
8
+ sdk_version: "5.42.0"
9
+ app_file: app_gradio.py
10
  pinned: false
11
  ---
12
 
13
+ # TopEdu Gradio App
14
+
15
+ This is a demo of the TopEdu application.
app_gradio.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PANSEA University Requirements Assistant - Gradio Version (Modular)
3
+ A comprehensive tool for navigating university admission requirements across Southeast Asia.
4
+ """
5
+ import gradio as gr
6
+ import os
7
+ import sys
8
+ from datetime import datetime
9
+
10
+ # Add the current directory to Python path for imports
11
+ sys.path.append(os.path.dirname(os.path.abspath(__file__)))
12
+
13
+ # Import our RAG system
14
+ from utils.rag_system import DocumentIngestion, RAGSystem
15
+
16
+ # Import modular tab components
17
+ from tabs.initialize import create_initialize_tab
18
+ from tabs.upload import create_upload_tab
19
+ from tabs.query import create_query_tab
20
+ from tabs.manage import create_manage_tab
21
+ from tabs.help import create_help_tab
22
+
23
def create_interface():
    """Assemble the main Gradio interface from the modular tab components.

    Returns:
        gr.Blocks: the fully wired interface, ready for ``launch()``.
    """

    # Shared mutable state, handed to every tab builder so they can
    # exchange the ingestion pipeline, RAG system, and vector store.
    state = {
        'doc_ingestion': None,
        'rag_system': None,
        'vectorstore': None
    }

    # Custom CSS applied on top of the Soft theme.
    interface_css = """
    .gradio-container {
        font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
    }
    .tab-nav button {
        font-weight: 500;
        font-size: 14px;
    }
    .tab-nav button[aria-selected="true"] {
        background: linear-gradient(45deg, #1e3a8a, #3b82f6);
        color: white;
    }
    .feedback-box {
        background: #f8fafc;
        border: 1px solid #e2e8f0;
        border-radius: 8px;
        padding: 16px;
        margin: 8px 0;
    }
    .success-message {
        background: #dcfce7;
        color: #166534;
        border: 1px solid #bbf7d0;
        padding: 12px;
        border-radius: 6px;
        margin: 8px 0;
    }
    .error-message {
        background: #fef2f2;
        color: #dc2626;
        border: 1px solid #fecaca;
        padding: 12px;
        border-radius: 6px;
        margin: 8px 0;
    }
    """

    # Top-level Blocks container; analytics disabled deliberately.
    with gr.Blocks(
        title="🌏 PANSEA University Assistant",
        theme=gr.themes.Soft(
            primary_hue="blue",
            secondary_hue="slate"
        ),
        css=interface_css,
        analytics_enabled=False
    ) as interface:

        # Header banner.
        gr.Markdown("""
        # 🌏 TopEdu

        **Navigate University Admission Requirements Across Southeast Asia with AI-Powered Assistance**

        Upload university documents, ask questions, and get intelligent answers about admission requirements,
        programs, deadlines, and more across Southeast Asian universities.

        ---
        """)

        # Each tab is a self-contained module; the order here fixes the
        # on-screen tab order.
        with gr.Tabs():
            for build_tab in (
                create_initialize_tab,
                create_upload_tab,
                create_query_tab,
                create_manage_tab,
                create_help_tab,
            ):
                build_tab(state)

        # Footer with session timestamp (evaluated once at build time).
        gr.Markdown(f"""
        ---

        **🔧 System Status**: Ready | **📅 Session**: {datetime.now().strftime('%Y-%m-%d %H:%M')} | **🔄 Version**: Modular Gradio

        💡 **Tip**: Start by initializing the system, then upload your university documents, and begin querying!
        """)

    return interface
112
+
113
def main():
    """Build the interface and launch the Gradio server."""
    interface = create_interface()

    # Launch configuration, gathered in one place for readability.
    launch_options = {
        "share": False,             # Set to True for public sharing
        "server_name": "0.0.0.0",   # Allow external connections
        "server_port": 7860,        # Default Gradio port
        "show_api": False,          # Hide API documentation
        "show_error": True,         # Show detailed error messages
        "quiet": False,             # Show startup messages
        "favicon_path": None,       # Could add custom favicon
        "app_kwargs": {
            "docs_url": None,       # Disable FastAPI docs
            "redoc_url": None       # Disable ReDoc docs
        },
    }
    interface.launch(**launch_options)
131
+
132
if __name__ == "__main__":
    # Startup banner printed before the server comes up.
    for banner_line in (
        "🚀 Starting PANSEA University Requirements Assistant...",
        "📍 Access the application at: http://localhost:7860",
        "🔗 For public sharing, set share=True in the launch() method",
        "-" * 60,
    ):
        print(banner_line)
    main()
requirements.txt ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiofiles==24.1.0
2
+ aiohappyeyeballs==2.6.1
3
+ aiohttp==3.12.15
4
+ aiosignal==1.4.0
5
+ altair==5.5.0
6
+ annotated-types==0.7.0
7
+ anyio==4.10.0
8
+ attrs==25.3.0
9
+ backoff==2.2.1
10
+ bcrypt==4.3.0
11
+ blinker==1.9.0
12
+ Brotli==1.1.0
13
+ build==1.3.0
14
+ cachetools==5.5.2
15
+ certifi==2025.8.3
16
+ charset-normalizer==3.4.3
17
+ chromadb==1.0.16
18
+ click==8.2.1
19
+ coloredlogs==15.0.1
20
+ dataclasses-json==0.6.7
21
+ distro==1.9.0
22
+ durationpy==0.10
23
+ fastapi==0.116.1
24
+ ffmpy==0.6.1
25
+ filelock==3.18.0
26
+ flatbuffers==25.2.10
27
+ frozenlist==1.7.0
28
+ fsspec==2025.7.0
29
+ gitdb==4.0.12
30
+ GitPython==3.1.45
31
+ google-auth==2.40.3
32
+ googleapis-common-protos==1.70.0
33
+ gradio==5.42.0
34
+ gradio_client==1.11.1
35
+ groovy==0.1.2
36
+ grpcio==1.74.0
37
+ h11==0.16.0
38
+ hf-xet==1.1.7
39
+ httpcore==1.0.9
40
+ httptools==0.6.4
41
+ httpx==0.28.1
42
+ httpx-sse==0.4.1
43
+ huggingface-hub==0.34.4
44
+ humanfriendly==10.0
45
+ idna==3.10
46
+ importlib_metadata==8.7.0
47
+ importlib_resources==6.5.2
48
+ Jinja2==3.1.6
49
+ jiter==0.10.0
50
+ joblib==1.5.1
51
+ jsonpatch==1.33
52
+ jsonpointer==3.0.0
53
+ jsonschema==4.25.0
54
+ jsonschema-specifications==2025.4.1
55
+ kubernetes==33.1.0
56
+ langchain==0.3.27
57
+ langchain-community==0.3.27
58
+ langchain-core==0.3.74
59
+ langchain-openai==0.3.29
60
+ langchain-text-splitters==0.3.9
61
+ langsmith==0.4.13
62
+ markdown-it-py==4.0.0
63
+ MarkupSafe==3.0.2
64
+ marshmallow==3.26.1
65
+ mdurl==0.1.2
66
+ mmh3==5.2.0
67
+ mpmath==1.3.0
68
+ multidict==6.6.4
69
+ mypy_extensions==1.1.0
70
+ narwhals==2.1.0
71
+ networkx==3.5
72
+ numpy==2.3.2
73
+ oauthlib==3.3.1
74
+ onnxruntime==1.22.1
75
+ openai==1.99.9
76
+ opentelemetry-api==1.36.0
77
+ opentelemetry-exporter-otlp-proto-common==1.36.0
78
+ opentelemetry-exporter-otlp-proto-grpc==1.36.0
79
+ opentelemetry-proto==1.36.0
80
+ opentelemetry-sdk==1.36.0
81
+ opentelemetry-semantic-conventions==0.57b0
82
+ orjson==3.11.1
83
+ overrides==7.7.0
84
+ packaging==25.0
85
+ pandas==2.3.1
86
+ pillow==11.3.0
87
+ posthog==5.4.0
88
+ propcache==0.3.2
89
+ protobuf==6.31.1
90
+ pyarrow==21.0.0
91
+ pyasn1==0.6.1
92
+ pyasn1_modules==0.4.2
93
+ pybase64==1.4.2
94
+ pycryptodome==3.23.0
95
+ pydantic==2.11.7
96
+ pydantic-settings==2.10.1
97
+ pydantic_core==2.33.2
98
+ pydeck==0.9.1
99
+ pydub==0.25.1
100
+ Pygments==2.19.2
101
+ PyPDF2==3.0.1
102
+ PyPika==0.48.9
103
+ pyproject_hooks==1.2.0
104
+ python-dateutil==2.9.0.post0
105
+ python-dotenv==1.1.1
106
+ python-multipart==0.0.20
107
+ pytz==2025.2
108
+ PyYAML==6.0.2
109
+ referencing==0.36.2
110
+ regex==2025.7.34
111
+ requests==2.32.4
112
+ requests-oauthlib==2.0.0
113
+ requests-toolbelt==1.0.0
114
+ rich==14.1.0
115
+ rpds-py==0.27.0
116
+ rsa==4.9.1
117
+ ruff==0.12.8
118
+ safehttpx==0.1.6
119
+ safetensors==0.6.2
120
+ scikit-learn==1.7.1
121
+ scipy==1.16.1
122
+ semantic-version==2.10.0
123
+ sentence-transformers==5.1.0
124
+ shellingham==1.5.4
125
+ six==1.17.0
126
+ smmap==5.0.2
127
+ sniffio==1.3.1
128
+ SQLAlchemy==2.0.43
129
+ starlette==0.47.2
130
+ streamlit==1.48.0
131
+ sympy==1.14.0
132
+ tenacity==9.1.2
133
+ threadpoolctl==3.6.0
134
+ tiktoken==0.11.0
135
+ tokenizers==0.21.4
136
+ toml==0.10.2
137
+ tomlkit==0.13.3
138
+ torch==2.8.0
139
+ tornado==6.5.2
140
+ tqdm==4.67.1
141
+ transformers==4.55.0
142
+ typer==0.16.0
143
+ typing-inspect==0.9.0
144
+ typing-inspection==0.4.1
145
+ typing_extensions==4.14.1
146
+ tzdata==2025.2
147
+ urllib3==2.5.0
148
+ uvicorn==0.35.0
149
+ uvloop==0.21.0
150
+ watchfiles==1.1.0
151
+ websocket-client==1.8.0
152
+ websockets==15.0.1
153
+ yarl==1.20.1
154
+ zipp==3.23.0
155
+ zstandard==0.23.0
runtime.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ python-3.11
sample_documents/sample_university_requirements.txt ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Sample University Admission Requirements
2
+
3
+ ## National University of Singapore (NUS) - Computer Science Master's Program
4
+
5
+ ### Program Overview
6
+ The Master of Computing (Computer Science) program at NUS is a comprehensive graduate program designed for students seeking advanced knowledge in computer science.
7
+
8
+ ### Admission Requirements
9
+
10
+ #### Academic Requirements
11
+ - Bachelor's degree in Computer Science, Computer Engineering, or related field
12
+ - Minimum GPA of 3.5/4.0 or equivalent (Second Class Upper Honours)
13
+ - Strong background in mathematics and programming
14
+
15
+ #### English Proficiency Requirements
16
+ For international students whose native language is not English:
17
+ - IELTS: Minimum overall score of 6.5 (no band less than 6.0)
18
+ - TOEFL iBT: Minimum score of 85 (writing minimum 22)
19
+ - PTE Academic: Minimum score of 65
20
+
21
+ #### Additional Requirements
22
+ - Statement of Purpose (500-1000 words)
23
+ - Two letters of recommendation from academic or professional referees
24
+ - Resume/CV highlighting relevant experience
25
+ - Portfolio of programming projects (preferred)
26
+
27
+ ### Tuition Fees (2024-2025 Academic Year)
28
+ - Singapore Citizens: S$12,500 per year
29
+ - Singapore Permanent Residents: S$17,500 per year
30
+ - International Students: S$25,000 per year
31
+
32
+ ### Application Deadlines
33
+ - **Priority Round**: November 15, 2024
34
+ - **Regular Round**: January 31, 2025
35
+ - **Late Round**: March 15, 2025 (subject to availability)
36
+
37
+ ### Application Process
38
+ 1. Submit online application through NUS Graduate School portal
39
+ 2. Upload required documents
40
+ 3. Pay application fee of S$50
41
+ 4. Submit by deadline
42
+ 5. Attend interview if shortlisted (February-April)
43
+ 6. Admission results: April-May
44
+
45
+ ### Program Duration
46
+ - Full-time: 1.5 years (3 semesters)
47
+ - Part-time: 2.5 years (5 semesters)
48
+
49
+ ### Financial Aid
50
+ - NUS Graduate Research Scholarship available for qualifying students
51
+ - Teaching assistantships for outstanding applicants
52
+ - Industry sponsorship opportunities
53
+
54
+ ### Contact Information
55
+ - Email: gradsch@nus.edu.sg
56
+ - Phone: +65 6516 2492
57
+ - Website: www.nus.edu.sg/graduateschool
58
+
59
+ ---
60
+
61
+ ## University of Malaya (UM) - Engineering Master's Programs
62
+
63
+ ### Program Overview
64
+ The Faculty of Engineering offers various Master's degree programs in engineering disciplines.
65
+
66
+ ### Admission Requirements
67
+
68
+ #### Academic Requirements
69
+ - Bachelor's degree in Engineering or related field with minimum CGPA of 3.0/4.0
70
+ - For applicants with CGPA below 3.0, relevant work experience of at least 2 years required
71
+
72
+ #### English Proficiency Requirements
73
+ For international students:
74
+ - IELTS: Minimum overall score of 6.0 (no band less than 5.5)
75
+ - TOEFL iBT: Minimum score of 80
76
+ - MUET (Malaysian University English Test): Band 4 minimum
77
+
78
+ #### Program-Specific Requirements
79
+ - **Civil Engineering**: AutoCAD proficiency preferred
80
+ - **Electrical Engineering**: Basic knowledge of circuit analysis
81
+ - **Mechanical Engineering**: Thermodynamics and fluid mechanics background
82
+
83
+ ### Tuition Fees (2024 Academic Year)
84
+ - Malaysian Citizens: RM 8,000 per year
85
+ - International Students: RM 15,000 per year
86
+ - ASEAN Students: RM 12,000 per year (special rate)
87
+
88
+ ### Application Deadlines
89
+ - **Main Intake (September)**: April 30, 2024
90
+ - **Second Intake (February)**: October 31, 2024
91
+
92
+ ### Scholarships Available
93
+ - UM Graduate Merit Scholarship (50% tuition fee waiver)
94
+ - ASEAN Scholarship Program
95
+ - Industry-sponsored scholarships
96
+
97
+ ### Living Costs (Estimated per month)
98
+ - Accommodation: RM 500-800
99
+ - Food: RM 400-600
100
+ - Transportation: RM 100-200
101
+ - Other expenses: RM 200-300
102
+ - **Total: RM 1,200-1,900 per month**
103
+
104
+ ### Application Requirements
105
+ 1. Completed application form
106
+ 2. Academic transcripts
107
+ 3. Bachelor's degree certificate
108
+ 4. English proficiency test results
109
+ 5. Two reference letters
110
+ 6. Research proposal (for research-based programs)
111
+ 7. Passport copy
112
+ 8. Passport-sized photographs
113
+
114
+ ### Contact Information
115
+ - Email: admission@um.edu.my
116
+ - Phone: +603 7967 3026
117
+ - Address: Faculty of Engineering, University of Malaya, 50603 Kuala Lumpur, Malaysia
118
+
119
+ ---
120
+
121
+ ## Chulalongkorn University - Business Administration Master's (MBA)
122
+
123
+ ### Program Overview
124
+ The Chulalongkorn Business School MBA program is Thailand's premier business education program.
125
+
126
+ ### Admission Requirements
127
+
128
+ #### Academic Requirements
129
+ - Bachelor's degree from accredited institution
130
+ - Minimum GPA of 2.75/4.0 or equivalent
131
+ - GMAT score of 500+ (preferred) or GRE equivalent
132
+ - Minimum 2 years of work experience
133
+
134
+ #### English Proficiency Requirements
135
+ - TOEFL iBT: Minimum score of 79
136
+ - IELTS: Minimum overall score of 6.5
137
+ - CU-TEP: Minimum score of 80
138
+
139
+ ### Tuition Fees (2024-2025)
140
+ - Full-time MBA: 850,000 THB (approx. USD 25,000) total program
141
+ - Executive MBA: 1,200,000 THB (approx. USD 35,000) total program
142
+
143
+ ### Application Deadlines
144
+ - **Early Admission**: February 15, 2024
145
+ - **Regular Admission**: April 30, 2024
146
+ - **Final Round**: June 15, 2024
147
+
148
+ ### Program Duration
149
+ - Full-time MBA: 16 months
150
+ - Executive MBA: 18 months (weekend classes)
151
+
152
+ ### Scholarships
153
+ - Merit-based scholarships up to 50% tuition
154
+ - Corporate sponsorship opportunities
155
+ - Government scholarships for ASEAN students
156
+
157
+ ### Application Process
158
+ 1. Online application submission
159
+ 2. Submit required documents
160
+ 3. GMAT/GRE scores
161
+ 4. Personal interview
162
+ 5. Group discussion assessment
163
+
164
+ ### Career Support
165
+ - Career counseling services
166
+ - Industry networking events
167
+ - Internship placement assistance
168
+ - Alumni network access
169
+
170
+ ### Contact Information
171
+ - Email: mba@cbs.chula.ac.th
172
+ - Phone: +66 2 218 6601
173
+ - Website: www.cbs.chula.ac.th
174
+
175
+ ---
176
+
177
+ *This document contains sample admission information for demonstration purposes. Please verify all details with the respective universities before applying.*
semantic_chunking.py ADDED
@@ -0,0 +1,420 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Semantic Chunker Module for RAG Systems
3
+ ======================================
4
+
5
+ A drop-in replacement for RecursiveCharacterTextSplitter that uses semantic similarity
6
+ to create more coherent chunks. Designed to work seamlessly with existing LangChain
7
+ and Streamlit RAG systems.
8
+
9
+ Author: AI Assistant
10
+ Compatible with: LangChain, BGE embeddings, OpenAI embeddings, Streamlit
11
+ """
12
+
13
+ import numpy as np
14
+ import re
15
+ from typing import List, Dict, Any, Optional, Union
16
+ from langchain.schema import Document
17
+ import streamlit as st
18
+ from sklearn.metrics.pairwise import cosine_similarity
19
+ import logging
20
+
21
+ # Set up logging
22
+ logging.basicConfig(level=logging.INFO)
23
+ logger = logging.getLogger(__name__)
24
+
25
class SemanticChunker:
    """
    Advanced semantic document chunker that creates coherent chunks based on
    semantic similarity rather than fixed character counts.

    Perfect for university documents, research papers, and policy documents
    where maintaining semantic coherence is crucial.
    """

    def __init__(self,
                 embeddings_model,
                 chunk_size: int = 4,
                 overlap: int = 1,
                 similarity_threshold: float = 0.75,
                 min_chunk_size: int = 150,
                 max_chunk_size: int = 1500,
                 sentence_split_pattern: Optional[str] = None,
                 debug: bool = False):
        """
        Initialize the semantic chunker.

        Args:
            embeddings_model: Your existing embeddings model (BGE, OpenAI, etc.)
            chunk_size: Base number of sentences per chunk (default: 4)
            overlap: Number of sentences to overlap between chunks (default: 1;
                used only by the simple fallback chunker)
            similarity_threshold: Cosine similarity threshold for extending chunks (0.0-1.0)
            min_chunk_size: Minimum characters per chunk (skip smaller chunks)
            max_chunk_size: Maximum characters per chunk (prevent overly large chunks)
            sentence_split_pattern: Custom regex pattern for sentence splitting
            debug: Enable debug logging and statistics
        """
        self.embeddings_model = embeddings_model
        self.chunk_size = chunk_size
        self.overlap = overlap
        self.similarity_threshold = similarity_threshold
        self.min_chunk_size = min_chunk_size
        self.max_chunk_size = max_chunk_size
        self.debug = debug

        # Default sentence splitting pattern optimized for academic/university documents:
        # split on runs of ./!/? followed by whitespace.
        self.sentence_pattern = sentence_split_pattern or r'[.!?]+\s+'

        # Statistics tracking, exposed via get_statistics()/display_statistics().
        self.stats = {
            "total_documents": 0,
            "total_chunks": 0,
            "avg_chunk_size": 0,
            "chunking_methods": {},
            "embedding_errors": 0
        }

        if self.debug:
            logger.info(f"Initialized SemanticChunker with threshold={similarity_threshold}")

    def _detect_embedding_model_type(self) -> str:
        """Best-effort detection of the wrapped embedding model's type (for stats display)."""
        if hasattr(self.embeddings_model, 'model'):
            # Likely a sentence-transformers wrapper (BGE, etc.)
            model_name = getattr(self.embeddings_model.model, 'model_name', 'sentence-transformers')
            return f"sentence-transformers ({model_name})"
        elif hasattr(self.embeddings_model, 'client'):
            # Likely OpenAI
            return "OpenAI"
        else:
            return "Unknown"

    def _preprocess_text_for_splitting(self, text: str) -> str:
        """
        Preprocess text to handle common formatting issues in university documents
        (missing spaces after sentence punctuation, collapsed whitespace, bullets).
        """
        fixes = [
            # Add space after periods before capital letters
            (r'([a-z])\.([A-Z])', r'\1. \2'),
            # Add space after numbers with periods
            (r'([0-9]+)\.([A-Z])', r'\1. \2'),
            # Fix missing spaces after question/exclamation marks
            (r'([a-z])\?([A-Z])', r'\1? \2'),
            (r'([a-z])\!([A-Z])', r'\1! \2'),
            # Clean up multiple spaces
            (r'\s+', ' '),
            # Fix bullet points
            (r'•\s*([A-Z])', r'• \1'),
            (r'-\s*([A-Z])', r'- \1'),
        ]

        processed_text = text
        for pattern, replacement in fixes:
            processed_text = re.sub(pattern, replacement, processed_text)

        return processed_text.strip()

    def _split_into_sentences(self, text: str) -> List[str]:
        """
        Split text into sentences, dropping fragments shorter than 10 chars,
        pure numbers, and punctuation-only strings.
        """
        text = self._preprocess_text_for_splitting(text)

        raw_sentences = re.split(self.sentence_pattern, text)

        sentences = []
        for sentence in raw_sentences:
            sentence = sentence.strip()
            # Filter out very short sentences, pure numbers, or empty strings
            if len(sentence) >= 10 and not sentence.isdigit() and not re.match(r'^[^\w]*$', sentence):
                sentences.append(sentence)

        if self.debug:
            logger.info(f"Split text into {len(sentences)} sentences")

        return sentences

    def _get_embeddings(self, texts: List[str]) -> Optional[np.ndarray]:
        """
        Get embeddings from the provided model with error handling.

        Returns:
            2D array of embeddings, or None on failure (caller falls back to
            simple chunking).
        """
        try:
            if hasattr(self.embeddings_model, 'model'):
                # sentence-transformers model (BGE, etc.)
                embeddings = self.embeddings_model.model.encode(texts)
                return np.array(embeddings)
            elif hasattr(self.embeddings_model, 'embed_documents'):
                # OpenAI or similar API-based embeddings
                embeddings = self.embeddings_model.embed_documents(texts)
                return np.array(embeddings)
            else:
                # Try direct call
                embeddings = self.embeddings_model(texts)
                return np.array(embeddings)

        except Exception as e:
            self.stats["embedding_errors"] += 1
            if self.debug:
                logger.error(f"Error generating embeddings: {e}")

            # Show warning in Streamlit if a script context is available.
            # FIX: was a bare `except:` which would also swallow KeyboardInterrupt.
            try:
                st.warning(f"⚠️ Embedding error, falling back to simple chunking: {str(e)[:100]}...")
            except Exception:
                pass  # Not running inside Streamlit

            return None

    def _calculate_semantic_boundaries(self, embeddings: np.ndarray, sentences: List[str]) -> List[int]:
        """
        Find natural semantic boundaries in the text based on drops in
        consecutive-sentence cosine similarity (adaptive threshold of
        mean - 0.5 * std).
        """
        boundaries = [0]  # Always start with first sentence

        # Similarity between each pair of consecutive sentences.
        similarities = []
        for i in range(len(embeddings) - 1):
            sim = cosine_similarity(
                embeddings[i:i+1],
                embeddings[i+1:i+2]
            )[0][0]
            similarities.append(sim)

        # Significant similarity drops mark topic boundaries.
        if len(similarities) > 1:
            mean_sim = np.mean(similarities)
            std_sim = np.std(similarities)
            threshold = mean_sim - (0.5 * std_sim)  # Adaptive threshold

            for i, sim in enumerate(similarities):
                if sim < threshold:
                    boundaries.append(i + 1)

        boundaries.append(len(sentences))  # Always end with last sentence

        return sorted(set(boundaries))  # Remove duplicates and sort

    def _create_chunks_from_boundaries(self, sentences: List[str], boundaries: List[int],
                                       embeddings: Optional[np.ndarray], metadata: Dict[str, Any]) -> List[Document]:
        """
        Create Document chunks from boundary indices, greedily extending each
        chunk with following sentences while similarity stays above
        ``similarity_threshold`` and the chunk stays within ``max_chunk_size``.
        """
        chunks = []

        for i in range(len(boundaries) - 1):
            start_idx = boundaries[i]
            end_idx = boundaries[i + 1]

            chunk_sentences = sentences[start_idx:end_idx]

            # Try to extend chunk if semantically similar to what follows.
            if embeddings is not None and end_idx < len(sentences):
                current_embedding = np.mean(embeddings[start_idx:end_idx], axis=0, keepdims=True)

                extended_end = end_idx
                while extended_end < len(sentences):
                    next_sentence_embedding = embeddings[extended_end:extended_end+1]
                    similarity = cosine_similarity(current_embedding, next_sentence_embedding)[0][0]

                    if similarity > self.similarity_threshold:
                        # Respect the max size limit before committing.
                        test_chunk = ' '.join(sentences[start_idx:extended_end+1])
                        if len(test_chunk) <= self.max_chunk_size:
                            extended_end += 1
                            # Re-center the chunk embedding on the extended span.
                            current_embedding = np.mean(embeddings[start_idx:extended_end], axis=0, keepdims=True)
                        else:
                            break
                    else:
                        break

                if extended_end > end_idx:
                    chunk_sentences = sentences[start_idx:extended_end]

            chunk_text = ' '.join(chunk_sentences)

            # Only keep chunks that meet the minimum size requirement.
            if len(chunk_text) >= self.min_chunk_size:
                chunk_metadata = metadata.copy()
                chunk_metadata.update({
                    "chunk_index": len(chunks),
                    "sentence_count": len(chunk_sentences),
                    "start_sentence": start_idx,
                    "end_sentence": start_idx + len(chunk_sentences) - 1,
                    "chunking_method": "semantic_boundary",
                    "similarity_threshold": self.similarity_threshold,
                    "chunk_size_chars": len(chunk_text)
                })

                chunks.append(Document(page_content=chunk_text, metadata=chunk_metadata))

        return chunks

    def _create_simple_chunks(self, sentences: List[str], metadata: Dict[str, Any]) -> List[Document]:
        """
        Fallback: fixed-size sentence windows with ``overlap`` sentences of
        overlap, used when embeddings are unavailable.
        """
        chunks = []

        for i in range(0, len(sentences), max(1, self.chunk_size - self.overlap)):
            chunk_sentences = sentences[i:i + self.chunk_size]
            chunk_text = ' '.join(chunk_sentences)

            if len(chunk_text) >= self.min_chunk_size:
                chunk_metadata = metadata.copy()
                chunk_metadata.update({
                    "chunk_index": len(chunks),
                    "sentence_count": len(chunk_sentences),
                    "start_sentence": i,
                    "end_sentence": i + len(chunk_sentences) - 1,
                    "chunking_method": "simple_fallback",
                    "chunk_size_chars": len(chunk_text)
                })

                chunks.append(Document(page_content=chunk_text, metadata=chunk_metadata))

        return chunks

    def split_documents(self, documents: List[Document]) -> List[Document]:
        """
        Main method: Split documents into semantically coherent chunks.

        Args:
            documents: List of LangChain Document objects

        Returns:
            List of Document objects with semantic chunks
        """
        all_chunks = []
        self.stats["total_documents"] = len(documents)

        for doc_idx, doc in enumerate(documents):
            try:
                sentences = self._split_into_sentences(doc.page_content)

                if not sentences:
                    if self.debug:
                        logger.warning(f"No sentences found in document {doc_idx}")
                    continue

                # Very short documents become a single chunk.
                if len(sentences) < self.chunk_size:
                    chunk_text = ' '.join(sentences)
                    if len(chunk_text) >= self.min_chunk_size:
                        chunk_metadata = doc.metadata.copy()
                        chunk_metadata.update({
                            "chunk_index": 0,
                            "total_chunks": 1,
                            "sentence_count": len(sentences),
                            "chunking_method": "single_chunk",
                            "chunk_size_chars": len(chunk_text)
                        })
                        all_chunks.append(Document(page_content=chunk_text, metadata=chunk_metadata))
                    continue

                embeddings = self._get_embeddings(sentences)

                if embeddings is not None:
                    # BUG FIX: previously a hard-coded [0, len(sentences)] was
                    # passed here, so _calculate_semantic_boundaries was dead
                    # code and every document became one "semantic" chunk span.
                    boundaries = self._calculate_semantic_boundaries(embeddings, sentences)
                    chunks = self._create_chunks_from_boundaries(sentences, boundaries, embeddings, doc.metadata)
                    method = "semantic"
                else:
                    # Fallback to simple chunking
                    chunks = self._create_simple_chunks(sentences, doc.metadata)
                    method = "simple_fallback"

                # Update statistics
                self.stats["chunking_methods"][method] = self.stats["chunking_methods"].get(method, 0) + 1

                # Update total chunks count in each chunk's metadata
                for chunk in chunks:
                    chunk.metadata["total_chunks"] = len(chunks)
                    chunk.metadata["source_document_index"] = doc_idx

                all_chunks.extend(chunks)

                if self.debug:
                    logger.info(f"Document {doc_idx}: {len(sentences)} sentences → {len(chunks)} chunks ({method})")

            except Exception as e:
                logger.error(f"Error processing document {doc_idx}: {e}")
                if self.debug:
                    st.error(f"Error processing document {doc_idx}: {e}")

        # Final statistics.
        self.stats["total_chunks"] = len(all_chunks)
        if all_chunks:
            chunk_sizes = [len(chunk.page_content) for chunk in all_chunks]
            self.stats["avg_chunk_size"] = sum(chunk_sizes) / len(chunk_sizes)

        if self.debug:
            logger.info(f"Created {len(all_chunks)} total chunks from {len(documents)} documents")

        return all_chunks

    def get_statistics(self) -> Dict[str, Any]:
        """Get a copy of the chunking statistics for analysis."""
        return self.stats.copy()

    def display_statistics(self):
        """Display chunking statistics in Streamlit; fall back to console output.

        FIX: previously caught ImportError, which can never be raised here
        because ``streamlit`` is imported at module level — outside a Streamlit
        script context the ``st.*`` calls raise other exceptions instead.
        """
        try:
            with st.expander("📊 Semantic Chunking Statistics"):
                col1, col2 = st.columns(2)

                with col1:
                    st.metric("Total Documents", self.stats["total_documents"])
                    st.metric("Total Chunks", self.stats["total_chunks"])

                with col2:
                    st.metric("Avg Chunk Size", f"{self.stats['avg_chunk_size']:.0f} chars")
                    st.metric("Embedding Errors", self.stats["embedding_errors"])

                if self.stats["chunking_methods"]:
                    st.write("**Chunking Methods Used:**")
                    for method, count in self.stats["chunking_methods"].items():
                        percentage = (count / self.stats["total_documents"]) * 100 if self.stats["total_documents"] > 0 else 0
                        st.write(f"  - {method}: {count} documents ({percentage:.1f}%)")

                st.write("**Configuration:**")
                st.json({
                    "chunk_size": self.chunk_size,
                    "overlap": self.overlap,
                    "similarity_threshold": self.similarity_threshold,
                    "min_chunk_size": self.min_chunk_size,
                    "max_chunk_size": self.max_chunk_size,
                    "embedding_model": self._detect_embedding_model_type()
                })

        except Exception:
            # Streamlit not available / no script context: print to console.
            print("\n=== Semantic Chunking Statistics ===")
            print(f"Documents processed: {self.stats['total_documents']}")
            print(f"Chunks created: {self.stats['total_chunks']}")
            print(f"Average chunk size: {self.stats['avg_chunk_size']:.0f} characters")
            print(f"Embedding errors: {self.stats['embedding_errors']}")
            print(f"Chunking methods: {self.stats['chunking_methods']}")
408
+
409
def create_semantic_chunker(embeddings_model, **kwargs) -> SemanticChunker:
    """Build a :class:`SemanticChunker` around an existing embeddings model.

    Args:
        embeddings_model: The embeddings backend to wrap (BGE, OpenAI, ...).
        **kwargs: Forwarded verbatim to the ``SemanticChunker`` constructor,
            so any of its tuning knobs (``chunk_size``, ``overlap``,
            ``similarity_threshold``, ...) can be overridden here.

    Returns:
        A ready-to-use ``SemanticChunker`` instance.
    """
    return SemanticChunker(embeddings_model=embeddings_model, **kwargs)
styles.css ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
/* Shared stylesheet for the PANSEA assistant UI. Relies on the host app's
   theme variables (--background-color, --text-color) for dark-mode support. */

/* Gradient hero banner; negative margins stretch it over the page padding. */
.main-header {
    text-align: center;
    padding: 2rem 0;
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    color: white;
    margin: -1rem -1rem 2rem -1rem;
    border-radius: 10px;
    box-shadow: 0 4px 15px 0 rgba(31, 38, 135, 0.37);
}

.stApp {
    background: var(--background-color);
}

/* Dark theme compatible containers */
.query-result {
    background: rgba(255, 255, 255, 0.05);
    backdrop-filter: blur(10px);
    border: 1px solid rgba(255, 255, 255, 0.1);
    padding: 1.5rem;
    border-radius: 15px;
    margin: 1rem 0;
    color: var(--text-color);
}

/* Source-citation card with a blue accent bar. */
.source-doc {
    background: rgba(31, 119, 180, 0.1);
    backdrop-filter: blur(5px);
    padding: 1rem;
    border-left: 4px solid #1f77b4;
    border-radius: 8px;
    margin: 0.5rem 0;
    color: var(--text-color);
}

/* Box surrounding the shareable result URL. */
.share-link {
    background: rgba(46, 204, 113, 0.1);
    backdrop-filter: blur(5px);
    padding: 1rem;
    border-radius: 10px;
    border-left: 4px solid #2ecc71;
    color: var(--text-color);
}

/* Model indicator boxes */
.model-info {
    background: rgba(52, 152, 219, 0.15);
    backdrop-filter: blur(10px);
    padding: 15px;
    border-radius: 12px;
    border-left: 4px solid #3498db;
    margin: 10px 0;
}

/* Language selection enhancement */
.language-selection {
    background: rgba(155, 89, 182, 0.1);
    backdrop-filter: blur(10px);
    padding: 15px;
    border-radius: 12px;
    border-left: 4px solid #9b59b6;
    margin: 10px 0;
}

/* Upload area enhancement */
.stFileUploader {
    background: rgba(230, 126, 34, 0.1);
    backdrop-filter: blur(10px);
    padding: 20px;
    border-radius: 15px;
    border: 2px dashed #e67e22;
}

.stFileUploader label {
    font-size: 1.2rem;
    font-weight: bold;
    color: var(--text-color);
}

/* Button enhancements */
.stButton > button {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    color: white;
    border: none;
    border-radius: 10px;
    padding: 0.6rem 1.5rem;
    font-weight: 600;
    transition: all 0.3s ease;
    box-shadow: 0 4px 15px 0 rgba(31, 38, 135, 0.37);
}

.stButton > button:hover {
    transform: translateY(-2px);
    box-shadow: 0 6px 20px 0 rgba(31, 38, 135, 0.5);
}

/* Sidebar enhancements */
/* NOTE(review): .css-1d391kg is an auto-generated Streamlit class name and can
   change between Streamlit releases — verify against the deployed version. */
.css-1d391kg {
    background: rgba(255, 255, 255, 0.02);
    backdrop-filter: blur(10px);
}

/* Info boxes */
.stInfo {
    background: rgba(52, 152, 219, 0.1);
    backdrop-filter: blur(10px);
    border-left: 4px solid #3498db;
}

.stSuccess {
    background: rgba(46, 204, 113, 0.1);
    backdrop-filter: blur(10px);
    border-left: 4px solid #2ecc71;
}

.stWarning {
    background: rgba(241, 196, 15, 0.1);
    backdrop-filter: blur(10px);
    border-left: 4px solid #f1c40f;
}

.stError {
    background: rgba(231, 76, 60, 0.1);
    backdrop-filter: blur(10px);
    border-left: 4px solid #e74c3c;
}
tabs/help.py ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Help tab functionality for the Gradio app
3
+ """
4
+ import gradio as gr
5
+
6
def create_help_tab(global_vars):
    """Create the Help tab with comprehensive documentation.

    Args:
        global_vars: Shared app-state dict; accepted for signature parity with
            the other tab builders but not read here (the tab is static markdown).
    """
    with gr.Tab("❓ Help", id="help"):
        # The entire tab body is a single static markdown document.
        gr.Markdown("""
        # 🌏 PANSEA University Requirements Assistant - User Guide

        Welcome to the PANSEA (Pan-Southeast Asian) University Requirements Assistant! This tool helps you navigate university admission requirements across Southeast Asian countries using advanced AI-powered document analysis.

        ---

        ## 🚀 Getting Started

        ### Step 1: Initialize the System
        1. Go to the **🔧 Initialize** tab
        2. Click **"Initialize All Systems"**
        3. Wait for the success message
        4. The system will set up AI models and document processing capabilities

        ### Step 2: Upload Documents
        1. Navigate to the **📤 Upload Documents** tab
        2. Select one or more PDF files containing university requirement information
        3. Fill in the document metadata:
           - **University Name**: Official name of the institution
           - **Country**: Select from Southeast Asian countries
           - **Document Type**: Choose the type of document
           - **Language**: Document language
        4. Click **"Process Documents"**
        5. Wait for processing completion

        ### Step 3: Query Documents
        1. Go to the **🔍 Query Documents** tab
        2. Type your question in the query box
        3. Click **"Search Documents"**
        4. Review the AI-generated answer and source references
        5. Use example questions to explore different types of queries

        ### Step 4: Manage Documents
        1. Visit the **🗂 Manage Documents** tab
        2. View all uploaded documents and statistics
        3. Delete individual documents or clear all documents as needed

        ---

        ## 📖 Features Overview

        ### 🤖 AI-Powered Analysis
        - Uses advanced SEA-LION AI models optimized for Southeast Asian contexts
        - Semantic search across your document collection
        - Contextual answers with source citations
        - Multi-language document support

        ### 📚 Document Management
        - Support for PDF documents
        - Intelligent text chunking for better search results
        - Metadata tracking (university, country, document type, language)
        - Easy document deletion and management

        ### 🌐 Regional Focus
        - Specialized for Southeast Asian universities
        - Supports multiple countries and languages
        - Culturally aware responses
        - Up-to-date admission requirement information

        ---

        ## 💡 Usage Tips

        ### Asking Better Questions
        - **Be Specific**: "What are the English proficiency requirements for Computer Science at NUS?" instead of "What are the requirements?"
        - **Include Context**: Mention specific programs, countries, or universities you're interested in
        - **Use Keywords**: Include terms like "admission", "requirements", "GPA", "test scores", etc.

        ### Document Upload Best Practices
        - **Quality Documents**: Upload official university brochures, requirement documents, or application guides
        - **Accurate Metadata**: Fill in all metadata fields correctly for better search results
        - **Regular Updates**: Replace outdated documents with current versions
        - **Organized Approach**: Upload documents systematically by country or university

        ### Managing Your Knowledge Base
        - **Regular Maintenance**: Remove outdated documents periodically
        - **Logical Organization**: Group related documents together
        - **Backup Important Queries**: Save important answers for future reference

        ---

        ## 🛠 Troubleshooting

        ### Common Issues

        **Problem**: "Please initialize systems first" error
        - **Solution**: Go to the Initialize tab and click "Initialize All Systems"

        **Problem**: Document upload fails
        - **Solution**: Ensure PDF files are not corrupted and contain text (not just images)

        **Problem**: No search results
        - **Solution**: Check if documents are uploaded and try different keywords

        **Problem**: Slow performance
        - **Solution**: Wait for processing to complete, avoid uploading too many large documents at once

        ### Technical Requirements
        - **File Format**: PDF documents only
        - **File Size**: Reasonable size limits (avoid extremely large files)
        - **Content**: Text-based PDFs work best (scanned images may not work well)
        - **Internet**: Required for AI model access

        ---

        ## 📊 Understanding Results

        ### Query Responses
        - **Answer**: AI-generated response based on your documents
        - **Sources**: Specific document chunks used to generate the answer
        - **Confidence**: Implied by the specificity and detail of the response
        - **Context**: Related information that might be helpful

        ### Document Statistics
        - **Total Documents**: Number of unique documents uploaded
        - **Total Chunks**: Number of text segments for searching
        - **Metadata**: Information about each document's origin and type

        ---

        ## 🌟 Best Practices for University Research

        ### Research Strategy
        1. **Start Broad**: Upload general university information first
        2. **Get Specific**: Add detailed program requirements
        3. **Compare Options**: Query for comparisons between universities
        4. **Verify Information**: Cross-reference with official university websites

        ### Question Types to Try
        - **Admission Requirements**: "What are the minimum GPA requirements for..."
        - **Test Scores**: "What IELTS/TOEFL scores are needed for..."
        - **Application Deadlines**: "When is the application deadline for..."
        - **Program Details**: "What courses are included in the... program at..."
        - **Scholarships**: "What scholarship opportunities are available for..."

        ---

        ## 🆘 Support & Feedback

        If you encounter issues or have suggestions for improvement:

        1. **Check Documentation**: Review this help section first
        2. **Try Different Approaches**: Rephrase your queries or check document formats
        3. **Document Issues**: Note specific error messages or unexpected behavior
        4. **Feature Requests**: Consider what additional functionality would be helpful

        ---

        ## 🔄 Version Information

        **Current Version**: Gradio-based PANSEA Assistant
        **AI Models**: SEA-LION optimized for Southeast Asian contexts
        **Document Processing**: Advanced semantic chunking and embedding
        **Search Technology**: Vector similarity search with contextual ranking

        ---

        *Happy university hunting! 🎓 We hope this tool helps you find the perfect educational opportunity in Southeast Asia.*
        """)
tabs/initialize.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Initialize tab functionality for the Gradio app
3
+ """
4
+ import gradio as gr
5
+ from utils.rag_system import DocumentIngestion, RAGSystem
6
+
7
def initialize_systems(global_vars):
    """Construct the DocumentIngestion and RAGSystem objects into global_vars.

    Args:
        global_vars: Shared state dict; 'doc_ingestion' and 'rag_system'
            entries are written on success.

    Returns:
        A human-readable status string for the UI; never raises.
    """
    try:
        print("🚀 Initializing document ingestion system...")
        global_vars['doc_ingestion'] = DocumentIngestion()
        print("🚀 Initializing RAG system...")
        global_vars['rag_system'] = RAGSystem()
    except Exception as exc:
        message = f"❌ Error initializing systems: {str(exc)}\n\n"
        # Offer installation hints when the failure mentions the embeddings backend.
        if "sentence-transformers" in str(exc):
            message += """
            **Possible solutions:**
            1. Install sentence-transformers: `pip install sentence-transformers`
            2. Or provide OpenAI API key in environment variables
            3. Check that PyTorch is properly installed

            **For deployment:**
            - Ensure requirements.txt includes: sentence-transformers, torch, transformers
            """
        return message
    return "✅ Systems initialized successfully! You can now upload documents."
29
+
30
def create_initialize_tab(global_vars):
    """Build the 'Initialize System' tab and wire its button to initialize_systems.

    Args:
        global_vars: Shared state dict, captured by closure for the click handler.
    """
    with gr.Tab("🚀 Initialize System", id="init"):
        gr.Markdown("""
        ### Step 1: Initialize the System
        Click the button below to initialize the AI models and embedding systems.
        This may take a few moments on first run as models are downloaded.
        """)

        start_button = gr.Button(
            "🚀 Initialize Systems",
            variant="primary",
            size="lg"
        )
        status_box = gr.Textbox(
            label="Initialization Status",
            interactive=False,
            lines=8,
            placeholder="Click 'Initialize Systems' to start..."
        )

        # Lambda closes over global_vars so the handler can mutate shared state.
        start_button.click(
            lambda: initialize_systems(global_vars),
            outputs=status_box
        )
tabs/manage.py ADDED
@@ -0,0 +1,237 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Manage documents tab functionality for the Gradio app
3
+ """
4
+ import gradio as gr
5
+
6
def manage_documents(global_vars):
    """Summarize the documents currently stored in the vectorstore.

    Args:
        global_vars: Shared state dict; reads 'doc_ingestion'.

    Returns:
        Tuple of (summary markdown, per-document markdown, dropdown options).
        The third element is now always a list — the original returned "" on
        early-exit paths and [] on the error path, and callers feed that value
        straight into gr.Dropdown(choices=...), which expects a list.
    """
    doc_ingestion = global_vars.get('doc_ingestion')

    if not doc_ingestion:
        return "❌ Please initialize systems first!", "", []

    try:
        vectorstore = doc_ingestion.load_existing_vectorstore()

        if not vectorstore:
            return "⚠️ No documents found. Upload documents first.", "", []

        # Pull every chunk plus its metadata out of the Chroma collection.
        collection = vectorstore._collection
        all_docs = collection.get(include=["metadatas", "documents"])
        metadatas = all_docs["metadatas"]
        ids = all_docs["ids"]
        documents = all_docs["documents"]

        # Group chunks by file_id so each uploaded file appears exactly once.
        doc_map = {}
        for meta, doc_id, doc_text in zip(metadatas, ids, documents):
            file_id = meta.get("file_id", doc_id)
            if file_id not in doc_map:
                doc_map[file_id] = {
                    "source": meta.get("source", "Unknown"),
                    "university": meta.get("university", "Unknown"),
                    "country": meta.get("country", "Unknown"),
                    "document_type": meta.get("document_type", "Unknown"),
                    "language": meta.get("language", "Unknown"),
                    "upload_timestamp": meta.get("upload_timestamp", "Unknown"),
                    "file_id": file_id,
                    "chunks": []
                }
            doc_map[file_id]["chunks"].append(doc_text)

        if not doc_map:
            return "ℹ️ No documents found in the system.", "", []

        total_documents = len(doc_map)
        total_chunks = sum(len(info["chunks"]) for info in doc_map.values())

        summary = f"""## 📊 Document Statistics

**Total Documents:** {total_documents}
**Total Text Chunks:** {total_chunks}
**Storage Status:** Active

## 📚 Document List
"""

        # Render one markdown section per document.
        document_list = ""
        for i, (file_id, info) in enumerate(doc_map.items(), 1):
            # Trim long (ISO-style) timestamps down to seconds precision.
            timestamp = info['upload_timestamp'][:19] if len(info['upload_timestamp']) > 19 else info['upload_timestamp']

            document_list += f"""
**{i}. {info['source']}**
- University: {info['university']}
- Country: {info['country']}
- Type: {info['document_type']}
- Language: {info['language']}
- Chunks: {len(info['chunks'])}
- Uploaded: {timestamp}
- File ID: `{file_id}`

---
"""

        # Dropdown labels must match the lookup format used by delete_document:
        # "<source> (<university>)".
        file_options = [f"{info['source']} ({info['university']})" for info in doc_map.values()]

        return summary, document_list, file_options

    except Exception as e:
        return f"❌ Error loading documents: {str(e)}", "", []
87
+
88
def delete_document(selected_file, current_doc_list, global_vars):
    """Remove every chunk belonging to the document picked in the dropdown.

    Args:
        selected_file: Dropdown label of the form "<source> (<university>)".
        current_doc_list: Markdown currently shown in the document list pane;
            returned unchanged on failure so the UI does not flicker.
        global_vars: Shared state dict; reads 'doc_ingestion'.

    Returns:
        Tuple of (status message, refreshed document list markdown).
    """
    ingestion = global_vars.get('doc_ingestion')

    if not ingestion or not selected_file:
        return "❌ Please select a document to delete.", current_doc_list

    try:
        store = ingestion.load_existing_vectorstore()
        if not store:
            return "❌ No vectorstore found.", current_doc_list

        chroma_collection = store._collection
        payload = chroma_collection.get(include=["metadatas"])
        metadatas = payload["metadatas"]
        ids = payload["ids"]

        # Recover the file_id behind the "<source> (<university>)" label.
        target_file_id = None
        for meta, chunk_id in zip(metadatas, ids):
            label = f"{meta.get('source', 'Unknown')} ({meta.get('university', 'Unknown')})"
            if label == selected_file:
                target_file_id = meta.get("file_id", chunk_id)
                break

        if not target_file_id:
            return "❌ Document not found.", current_doc_list

        # Collect and delete every chunk that shares the matched file_id.
        doomed_ids = [
            chunk_id
            for meta, chunk_id in zip(metadatas, ids)
            if meta.get("file_id", chunk_id) == target_file_id
        ]
        chroma_collection.delete(ids=doomed_ids)

        # Rebuild the on-screen document list after the deletion.
        _, refreshed_list, _ = manage_documents(global_vars)

        return f"✅ Successfully deleted document: {selected_file}", refreshed_list

    except Exception as e:
        return f"❌ Error deleting document: {str(e)}", current_doc_list
129
+
130
def delete_all_documents(global_vars):
    """Wipe every chunk from the vectorstore and clear the cached reference.

    Args:
        global_vars: Shared state dict; reads 'doc_ingestion' and resets
            'vectorstore' to None after a successful wipe.

    Returns:
        Tuple of (status message, empty string for the document-list pane).
    """
    ingestion = global_vars.get('doc_ingestion')

    if not ingestion:
        return "❌ Please initialize systems first.", ""

    try:
        store = ingestion.load_existing_vectorstore()
        if not store:
            return "⚠️ No documents found to delete.", ""

        # Fetch every chunk id currently held by the Chroma collection.
        chroma_collection = store._collection
        all_ids = chroma_collection.get()["ids"]

        if not all_ids:
            return "ℹ️ No documents found to delete.", ""

        chroma_collection.delete(ids=all_ids)
        # Drop the cached vectorstore so the query tab no longer sees stale data.
        global_vars['vectorstore'] = None
        return f"✅ Successfully deleted all {len(all_ids)} document chunks.", ""

    except Exception as e:
        return f"❌ Error deleting all documents: {str(e)}", ""
158
+
159
def create_manage_tab(global_vars):
    """Create the Manage Documents tab: list view, per-document delete, delete-all.

    Args:
        global_vars: Shared state dict, captured by closure for all handlers.
    """
    with gr.Tab("🗂 Manage Documents", id="manage"):
        gr.Markdown("""
        ### Step 4: Manage Your Documents
        View, inspect, and manage all uploaded documents in your knowledge base.
        You can see document details and delete individual documents or all documents.
        """)

        # Buttons for actions
        with gr.Row():
            refresh_btn = gr.Button("🔄 Refresh Document List", variant="secondary")
            delete_all_btn = gr.Button("🗑️ Delete All Documents", variant="stop")

        # Document statistics and list
        doc_summary = gr.Markdown(
            value="📊 Click 'Refresh Document List' to view your documents.",
            label="Document Summary"
        )

        doc_list = gr.Markdown(
            value="📚 Document details will appear here after refresh.",
            label="Document List"
        )

        # Individual document deletion
        gr.Markdown("### 🗑️ Delete Individual Document")

        with gr.Row():
            file_selector = gr.Dropdown(
                choices=[],
                label="Select Document to Delete",
                interactive=True,
                info="First click 'Refresh Document List' to see available documents"
            )
            delete_single_btn = gr.Button("🗑️ Delete Selected", variant="stop")

        delete_status = gr.Textbox(
            label="Action Status",
            interactive=False,
            lines=2,
            placeholder="Deletion status will appear here..."
        )

        # Event handlers
        def refresh_documents():
            # Re-scan the vectorstore; returning a gr.Dropdown replaces the
            # component's choices in place.
            summary, documents, file_options = manage_documents(global_vars)
            # Update dropdown choices
            return summary, documents, gr.Dropdown(choices=file_options, value=None)

        def delete_selected_document(selected_file, current_list):
            # Guard: nothing selected in the dropdown yet.
            if not selected_file:
                return "❌ Please select a document to delete first.", current_list, gr.Dropdown(choices=[])

            status, new_list = delete_document(selected_file, current_list, global_vars)
            # Also refresh the file options after deletion
            _, _, new_options = manage_documents(global_vars)
            return status, new_list, gr.Dropdown(choices=new_options, value=None)

        def delete_all_docs():
            # NOTE(review): the returned empty_list is discarded in favor of a
            # fixed placeholder message — confirm this is intentional.
            status, empty_list = delete_all_documents(global_vars)
            return status, "📚 No documents in the system.", gr.Dropdown(choices=[], value=None)

        # Connect event handlers
        refresh_btn.click(
            refresh_documents,
            outputs=[doc_summary, doc_list, file_selector]
        )

        delete_single_btn.click(
            delete_selected_document,
            inputs=[file_selector, doc_list],
            outputs=[delete_status, doc_list, file_selector]
        )

        delete_all_btn.click(
            delete_all_docs,
            outputs=[delete_status, doc_list, file_selector]
        )
tabs/query.py ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Query documents tab functionality for the Gradio app
3
+ """
4
+ import gradio as gr
5
+
6
def query_documents(question, language, global_vars):
    """Run a RAG query and format the answer plus source citations as markdown.

    Args:
        question: User's natural-language question.
        language: Desired response language, forwarded to the RAG system.
        global_vars: Shared state dict; reads 'rag_system' and 'vectorstore'.

    Returns:
        Markdown string for the response box (answers and error messages alike).
    """
    rag = global_vars.get('rag_system')
    store = global_vars.get('vectorstore')

    # Guard clauses: surface setup problems as user-facing messages.
    if not rag:
        return "❌ Please initialize systems first using the 'Initialize System' tab!"
    if not store:
        return "❌ Please upload and process documents first using the 'Upload Documents' tab!"
    if not question.strip():
        return "❌ Please enter a question."

    try:
        print(f"🔍 Processing query: {question}")
        result = rag.query(question, language)

        answer = result["answer"]
        sources = result.get("source_documents", [])
        model_used = result.get("model_used", "SEA-LION")

        # Header: which model answered, then the answer body.
        response = f"**Model Used:** {model_used}\n\n"
        response += f"**Answer:**\n{answer}\n\n"

        if not sources:
            response += "\n*No specific sources found. This might be a general response.*"
            return response

        # Cite at most the three most relevant source chunks.
        response += "**📚 Sources:**\n"
        for i, doc in enumerate(sources[:3], 1):
            meta = doc.metadata
            response += f"{i}. **{meta.get('source', 'Unknown')}**\n"
            response += f"   - University: {meta.get('university', 'Unknown')}\n"
            response += f"   - Country: {meta.get('country', 'Unknown')}\n"
            response += f"   - Type: {meta.get('document_type', 'Unknown')}\n"
            response += f"   - Preview: {doc.page_content[:150]}...\n\n"

        return response

    except Exception as e:
        return f"❌ Error querying documents: {str(e)}\n\nPlease check the console for more details."
54
+
55
def get_example_questions():
    """Canned example queries rendered as quick-start buttons in the query tab."""
    examples = (
        "What are the admission requirements for Computer Science programs in Singapore?",
        "Which universities offer scholarships for international students?",
        "What are the tuition fees for MBA programs in Thailand?",
        "Find universities with engineering programs under $5000 per year",
        "What are the application deadlines for programs in Malaysia?",
        "Compare admission requirements between different ASEAN countries",
    )
    # Return a fresh list so callers may mutate their copy safely.
    return list(examples)
65
+
66
def create_query_tab(global_vars):
    """Create the Search & Query tab: question box, language picker, results pane.

    Args:
        global_vars: Shared state dict, captured by closure for the click handler.
    """
    with gr.Tab("🔍 Search & Query", id="query"):
        gr.Markdown("""
        ### Step 3: Ask Questions
        Ask questions about the uploaded documents in your preferred language.
        The AI will provide detailed answers with source citations.
        """)

        with gr.Row():
            with gr.Column(scale=3):
                question_input = gr.Textbox(
                    label="💭 Your Question",
                    placeholder="Ask anything about the universities...",
                    lines=3
                )

            with gr.Column(scale=1):
                language_dropdown = gr.Dropdown(
                    choices=[
                        "English", "Chinese", "Malay", "Thai",
                        "Indonesian", "Vietnamese", "Filipino"
                    ],
                    value="English",
                    label="🌍 Response Language"
                )

        query_btn = gr.Button(
            "🔍 Search Documents",
            variant="primary",
            size="lg"
        )

        answer_output = gr.Textbox(
            label="🤖 AI Response",
            interactive=False,
            lines=20,
            placeholder="Ask a question to get AI-powered answers..."
        )

        # Example questions section
        gr.Markdown("### 💡 Example Questions")
        example_questions = get_example_questions()

        # Lay out example buttons two per column. The `x=...` default argument
        # snapshots each question at lambda-definition time, avoiding the
        # late-binding-closure pitfall inside this loop.
        with gr.Row():
            for i in range(0, len(example_questions), 2):
                with gr.Column():
                    if i < len(example_questions):
                        example_btn = gr.Button(
                            example_questions[i],
                            size="sm",
                            variant="secondary"
                        )
                        # Clicking an example copies its text into the question box.
                        example_btn.click(
                            lambda x=example_questions[i]: x,
                            outputs=question_input
                        )

                    if i + 1 < len(example_questions):
                        example_btn2 = gr.Button(
                            example_questions[i + 1],
                            size="sm",
                            variant="secondary"
                        )
                        example_btn2.click(
                            lambda x=example_questions[i + 1]: x,
                            outputs=question_input
                        )

        # Main search action: forward question + language to the RAG backend.
        query_btn.click(
            lambda question, language: query_documents(question, language, global_vars),
            inputs=[question_input, language_dropdown],
            outputs=answer_output
        )
tabs/upload.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Upload documents tab functionality for the Gradio app
3
+ """
4
+ import gradio as gr
5
+
6
def upload_documents(files, global_vars):
    """Validate uploaded files, run ingestion, and build the vector store.

    Args:
        files: Uploaded file paths from the gr.File component.
        global_vars: Shared state dict; reads 'doc_ingestion', writes
            'vectorstore' on success.

    Returns:
        Markdown status string describing success or the failure reason.
    """
    doc_ingestion = global_vars.get('doc_ingestion')

    if not doc_ingestion:
        return "❌ Please initialize systems first using the 'Initialize System' tab!"

    if not files:
        return "❌ Please upload at least one PDF file."

    try:
        # Keep only PDFs. The extension check is case-insensitive so files
        # named e.g. 'Brochure.PDF' are no longer silently dropped (bug fix:
        # the original used a case-sensitive endswith('.pdf')).
        pdf_files = [path for path in files if path.lower().endswith('.pdf')]

        if not pdf_files:
            return "❌ Please upload PDF files only."

        print(f"📄 Processing {len(pdf_files)} PDF file(s)...")

        documents = doc_ingestion.process_documents(pdf_files)

        if not documents:
            return "❌ No documents were successfully processed. Please check if your PDFs are readable."

        print("🔗 Creating vector store...")
        vectorstore = doc_ingestion.create_vector_store(documents)

        if not vectorstore:
            return "❌ Failed to create vector store from documents."

        # Cache the store for the query tab.
        global_vars['vectorstore'] = vectorstore

        # Build a per-document processing summary for the status box.
        summary = f"✅ Successfully processed {len(documents)} document(s):\n\n"
        for i, doc in enumerate(documents, 1):
            metadata = doc.metadata
            summary += f"{i}. **{metadata['source']}**\n"
            summary += f"   - University: {metadata.get('university', 'Unknown')}\n"
            summary += f"   - Country: {metadata.get('country', 'Unknown')}\n"
            summary += f"   - Type: {metadata.get('document_type', 'Unknown')}\n"
            summary += f"   - Language: {metadata.get('language', 'Unknown')}\n\n"

        summary += "🎉 **Ready for queries!** Go to the 'Search & Query' tab to start asking questions."
        return summary

    except Exception as e:
        return f"❌ Error processing documents: {str(e)}\n\nPlease check the console for more details."
65
+
66
def create_upload_tab(global_vars):
    """Build the Upload Documents tab and wire its button to upload_documents.

    Args:
        global_vars: Shared state dict, captured by closure for the click handler.
    """
    with gr.Tab("📄 Upload Documents", id="upload"):
        gr.Markdown("""
        ### Step 2: Upload PDF Documents
        Upload university documents (brochures, admission guides, etc.) in PDF format.
        The system will automatically extract metadata including university name, country, and document type.
        """)

        pdf_picker = gr.File(
            label="📁 Upload PDF Documents",
            file_types=[".pdf"],
            file_count="multiple",
            height=120
        )

        process_button = gr.Button(
            "📄 Process Documents",
            variant="primary",
            size="lg"
        )

        status_box = gr.Textbox(
            label="Processing Status",
            interactive=False,
            lines=12,
            placeholder="Upload PDF files and click 'Process Documents'..."
        )

        # The lambda closes over global_vars so the handler reaches shared state.
        process_button.click(
            lambda files: upload_documents(files, global_vars),
            inputs=pdf_picker,
            outputs=status_box
        )
utils/display.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from utils.rag_system import DocumentIngestion, RAGSystem, save_query_result, load_shared_query
3
+
4
def display_query_result(result, show_share_link=False):
    """Display query results in a formatted way.

    Args:
        result: Dict with at least 'answer'; optionally 'model_used',
            'query_id', and 'source_documents' (objects exposing
            .metadata and .page_content).
        show_share_link: When True and result carries a 'query_id',
            render a shareable URL for this result.
    """
    # Opens the styled container; the matching closing div is emitted last.
    st.markdown('<div class="query-result">', unsafe_allow_html=True)

    # Show which model was used
    if result.get("model_used"):
        st.info(f"🤖 **Model Used:** {result['model_used']}")

    st.subheader("🎯 Answer")
    st.write(result["answer"])

    # Share link
    if show_share_link and result.get("query_id"):
        st.markdown("---")
        # NOTE(review): assumes plain http and the default :8501 fallback —
        # confirm the deployed scheme/host before relying on these links.
        current_url = st.get_option("browser.serverAddress") or "localhost:8501"
        share_url = f"http://{current_url}?share={result['query_id']}"
        st.markdown(f"""
        <div class="share-link">
            <strong>🔗 Share this result:</strong><br>
            <code>{share_url}</code>
        </div>
        """, unsafe_allow_html=True)

        if st.button("📋 Copy Share Link"):
            st.code(share_url)

    # Source documents
    if result.get("source_documents"):
        st.markdown("---")
        st.subheader("📚 Sources")
        for i, doc in enumerate(result["source_documents"], 1):
            with st.expander(f"Source {i}: {doc.metadata.get('source', 'Unknown')}"):
                col1, col2 = st.columns([1, 2])
                with col1:
                    st.write(f"**University:** {doc.metadata.get('university', 'Unknown')}")
                    st.write(f"**Country:** {doc.metadata.get('country', 'Unknown')}")
                    st.write(f"**Type:** {doc.metadata.get('document_type', 'Unknown')}")
                with col2:
                    st.write("**Relevant Content:**")
                    # Truncate long chunks to a 300-character preview.
                    content_preview = doc.page_content[:300] + "..." if len(doc.page_content) > 300 else doc.page_content
                    st.write(content_preview)

    st.markdown('</div>', unsafe_allow_html=True)
47
+
48
def display_shared_query(query_id):
    """Render a previously saved query result referenced by a share link.

    Args:
        query_id: Identifier of a saved query, resolved via load_shared_query.
    """
    st.header("🔗 Shared Query Result")

    result_data = load_shared_query(query_id)

    if result_data:
        st.info(f"**Original Question:** {result_data['question']}")
        st.write(f"**Language:** {result_data['language']}")
        st.write(f"**Date:** {result_data['timestamp'][:10]}")  # date portion of the timestamp string

        # Create a mock result object for display: anonymous objects carrying
        # the .metadata / .page_content attributes display_query_result expects.
        mock_result = {
            "answer": result_data["answer"],
            "source_documents": [
                type('MockDoc', (), {
                    'metadata': source,
                    'page_content': source.get('content_preview', '')
                })() for source in result_data.get('sources', [])
            ]
        }

        display_query_result(mock_result, show_share_link=False)

        # NOTE(review): st.experimental_set_query_params / st.experimental_rerun
        # are deprecated and removed in recent Streamlit releases — verify the
        # pinned Streamlit version still provides them.
        if st.button("🔍 Ask Your Own Question"):
            st.experimental_set_query_params()
            st.experimental_rerun()
    else:
        st.error("❌ Shared query not found or has expired.")
        if st.button("🏠 Go to Home"):
            st.experimental_set_query_params()
            st.experimental_rerun()
utils/rag_system.py ADDED
@@ -0,0 +1,615 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import uuid
3
+ import tempfile
4
+ from typing import List, Optional, Dict, Any
5
+ from pathlib import Path
6
+ import PyPDF2
7
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
8
+ from langchain_openai import OpenAIEmbeddings, ChatOpenAI
9
+ from langchain_community.vectorstores import Chroma
10
+ from langchain.chains import RetrievalQA
11
+ from langchain_community.document_loaders import PyPDFLoader
12
+ from langchain.schema import Document
13
+ from dotenv import load_dotenv
14
+
15
+ from datetime import datetime
16
+ import json
17
+ import base64
18
+ from openai import OpenAI
19
+ import re
20
+ from semantic_chunking import SemanticChunker
21
+
22
+ # Load environment variables
23
+ load_dotenv()
24
+
25
class AlternativeEmbeddings:
    """Alternative embeddings using Sentence Transformers when OpenAI is not available"""

    def __init__(self):
        # Filled in below once one of the candidate models loads.
        self.model = None
        self.embedding_size = 384

        try:
            from sentence_transformers import SentenceTransformer

            # Candidate models ordered smallest/most reliable first for
            # better cloud compatibility; stop at the first that loads.
            candidates = [
                ("all-MiniLM-L6-v2", 384),  # Very small and reliable
                ("paraphrase-MiniLM-L3-v2", 384),  # Even smaller
                ("BAAI/bge-small-en-v1.5", 384),  # Original choice
            ]

            for name, dim in candidates:
                try:
                    print(f"🔄 Trying to load model: {name}")
                    self.model = SentenceTransformer(name)
                    self.embedding_size = dim
                    print(f"✅ Successfully loaded: {name}")
                    break
                except Exception as e:
                    print(f"⚠️ Failed to load {name}: {str(e)}")

            if not self.model:
                raise Exception("All embedding models failed to load")

        except ImportError:
            print("❌ sentence-transformers not available. Please install it or provide OpenAI API key.")
            raise ImportError("sentence-transformers not available")

    def embed_documents(self, texts):
        """Embed a batch of texts; returns one vector (as a list) per text."""
        if not self.model:
            raise Exception("No embedding model available")
        try:
            vectors = self.model.encode(texts, convert_to_numpy=True)
            return vectors.tolist()
        except Exception as e:
            print(f"Error encoding documents: {e}")
            raise

    def embed_query(self, text):
        """Embed a single query string; returns its vector as a list."""
        if not self.model:
            raise Exception("No embedding model available")
        try:
            vector = self.model.encode([text], convert_to_numpy=True)[0]
            return vector.tolist()
        except Exception as e:
            print(f"Error encoding query: {e}")
            raise
77
+
78
class SEALionLLM:
    """Custom LLM class for SEA-LION models.

    Routes each query to one of two SEA-LION models served behind an
    OpenAI-compatible API: a fast instruct model for simple/translation
    queries, and a reasoning model (with thinking mode) for multi-criteria
    university-search queries. Also extracts document metadata via the
    instruct model.
    """

    def __init__(self):
        # SEA-LION exposes an OpenAI-compatible endpoint, so the stock
        # OpenAI client is simply pointed at the SEA-LION base URL.
        self.client = OpenAI(
            api_key=os.getenv("SEA_LION_API_KEY"),
            base_url=os.getenv("SEA_LION_BASE_URL", "https://api.sea-lion.ai/v1")
        )

        # Model configurations: instruct for simple queries, reasoning
        # (thinking mode) for complex searches — see generate_response.
        self.instruct_model = "aisingapore/Gemma-SEA-LION-v3-9B-IT"
        self.reasoning_model = "aisingapore/Llama-SEA-LION-v3.5-8B-R"

    def _is_complex_query(self, query: str) -> bool:
        """Determine if query requires reasoning model or simple instruct model.

        Heuristic: count keyword hits (+1 each) and budget/comparison
        pattern hits (+2 each); a total of 2 or more marks the query as
        complex.
        """
        # Keywords that indicate complex university search queries
        complex_keywords = [
            "university", "admission", "requirement", "tuition", "fee", "program", "course",
            "degree", "master", "bachelor", "phd", "scholarship", "deadline", "application",
            "budget", "under", "less than", "below", "compare", "recommend", "suggest",
            "which", "what are the", "show me", "find me", "search for",
            # Chinese keywords
            "大学", "学费", "专业", "硕士", "学士", "博士", "申请", "要求", "奖学金",
            # Malay keywords
            "universiti", "yuran", "program", "ijazah", "syarat", "permohonan",
            # Thai keywords
            "มหาวิทยาลัย", "ค่าเล่าเรียน", "หลักสูตร", "ปริญญา", "เงื่อนไข",
            # Indonesian keywords
            "universitas", "biaya", "kuliah", "program", "sarjana", "persyaratan"
        ]

        # Check for multiple criteria (indicates complex search)
        criteria_count = 0
        query_lower = query.lower()

        for keyword in complex_keywords:
            if keyword.lower() in query_lower:
                criteria_count += 1

        # Also check for comparison words, numbers, conditions
        comparison_patterns = [
            r"under \$?\d+", r"less than \$?\d+", r"below \$?\d+", r"between \$?\d+ and \$?\d+",
            r"不超过.*元", r"低于.*元", r"少于.*元", # Chinese
            r"kurang dari", r"di bawah", # Malay/Indonesian
            r"น้อยกว่า", r"ต่ำกว่า" # Thai
        ]

        # Budget/comparison patterns weigh double: they strongly indicate
        # a constrained search rather than a simple question.
        for pattern in comparison_patterns:
            if re.search(pattern, query_lower):
                criteria_count += 2

        # Complex query if multiple keywords or comparison patterns found
        return criteria_count >= 2

    def _is_translation_query(self, query: str) -> bool:
        """Check if query is primarily for translation (routed to instruct model)."""
        translation_keywords = [
            "translate", "translation", "แปล", "翻译", "terjemah", "traduire"
        ]

        query_lower = query.lower()
        return any(keyword in query_lower for keyword in translation_keywords)

    def generate_response(self, query: str, context: str = "", language: str = "English") -> str:
        """Generate response using the appropriate SEA-LION model.

        Args:
            query: The user's question.
            context: Optional retrieved document excerpts to ground the answer.
            language: Language the model is instructed to respond in.

        Returns:
            The model's answer text (with any <think> reasoning stripped),
            or an apology string describing the error on failure.
        """

        # Choose model based on query complexity
        if self._is_translation_query(query) or not self._is_complex_query(query):
            model = self.instruct_model
            use_reasoning = False
        else:
            model = self.reasoning_model
            use_reasoning = True

        # Prepare messages
        system_prompt = f"""You are a helpful assistant specializing in ASEAN university admissions.
Respond in {language} unless specifically asked otherwise.

If provided with context from university documents, use that information to give accurate, specific answers.
Always cite your sources when using provided context.

For complex university search queries, provide:
1. Direct answers to the question
2. Relevant admission requirements
3. Tuition fees (if available)
4. Application deadlines (if available)
5. Source citations from the documents

Context: {context}"""

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": query}
        ]

        try:
            if use_reasoning:
                # Use reasoning model with thinking mode; lower temperature
                # and a larger token budget for multi-part answers.
                response = self.client.chat.completions.create(
                    model=model,
                    messages=messages,
                    max_tokens=2000,
                    temperature=0.1,
                    extra_body={"thinking_mode": True}
                )
            else:
                # Use instruct model for simpler queries
                response = self.client.chat.completions.create(
                    model=model,
                    messages=messages,
                    max_tokens=1500,
                    temperature=0.3
                )

            # Strip out reasoning steps from the response: the reasoning
            # model emits its chain of thought before a closing </think> tag.
            response_text = response.choices[0].message.content
            if "</think>" in response_text:
                response_text = response_text.split("</think>")[-1].strip()

            return response_text

        except Exception as e:
            print(f"Error with SEA-LION model: {str(e)}")
            return f"I apologize, but I encountered an error processing your query. Please try rephrasing your question. Error: {str(e)}"

    def extract_metadata(self, document_text: str) -> Dict[str, str]:
        """Extract metadata from document text using the instruct LLM.

        Asks the model for a flat JSON object with keys university_name,
        country, document_type and language, then parses it with fallbacks:
        JSON parse -> line-based text extraction -> hard-coded defaults.

        Args:
            document_text: Raw text (callers pass the first pages of a PDF).

        Returns:
            Dict with keys "university_name", "country", "document_type",
            "language"; values default to "Unknown"/"general_info" on failure.
        """

        system_prompt = """You are an expert at extracting metadata from university documents.
Analyze the provided document text and extract the following information:

1. University name (full official name)
2. Country (where the university is located)
3. Document type (choose from: admission_requirements, tuition_fees, program_information, scholarship_info, application_deadlines, general_info)
4. Language (choose from: English, Chinese, Malay, Thai, Indonesian, Vietnamese, Filipino)

Return your response as a JSON object with these exact keys:
{
    "university_name": "extracted university name or \'Unknown\' if not found",
    "country": "extracted country or \'Unknown\' if not found",
    "document_type": "most appropriate document type from the list above",
    "language": "detected language of the document"
}

Guidelines:
- For university_name: Look for official university names, avoid abbreviations when possible
- For country: Look for country names, city names that indicate country, or domain extensions
- For document_type: Analyze the content to determine what type of information it contains
- For language: Determine the primary language of the document.
- If information is unclear, use "Unknown" for university_name and country
- Always choose one of the specified document_type options and language options
"""

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": f"Extract metadata from this document text:\n\n{document_text}"}
        ]

        try:
            response = self.client.chat.completions.create(
                model=self.instruct_model,
                messages=messages,
                max_tokens=500,
                temperature=0.1
            )

            response_text = response.choices[0].message.content.strip()
            print("--- DEBUG: LLM Metadata Extraction Details ---")
            print(f"**Input Text for LLM (first 2 pages):**\n```\n{document_text[:1000]}...\n```") # Show first 1000 chars of input
            print(f"**Raw LLM Response:**\n```json\n{response_text}\n```")

            # Grab the first {...} span; non-greedy is safe because the
            # prompted JSON schema is flat (no nested objects).
            json_match = re.search(r'\{.*?\}', response_text, re.DOTALL)
            if json_match:
                json_str = json_match.group(0)
                try:
                    metadata = json.loads(json_str)
                    print(f"**Parsed JSON Metadata:**\n```json\n{json.dumps(metadata, indent=2)}\n```")
                    required_keys = ["university_name", "country", "document_type", "language"]
                    if all(key in metadata for key in required_keys):
                        print("DEBUG: Successfully extracted and parsed metadata from LLM.")
                        return metadata
                    else:
                        print("DEBUG: LLM response missing required keys, attempting fallback or using defaults.")
                        return self._get_default_metadata()
                except json.JSONDecodeError as e:
                    print(f"DEBUG: JSON Parsing Failed: {e}")
                    print(f"DEBUG: Attempting fallback text extraction from raw response.")
                    return self._extract_from_text_response(response_text)
            else:
                print("DEBUG: No JSON object found in LLM response.")
                return self._extract_from_text_response(response_text)

        except Exception as e:
            print(f"DEBUG: Error during LLM Metadata Extraction: {str(e)}")
            return self._get_default_metadata()

    def _extract_from_text_response(self, response_text: str) -> Dict[str, str]:
        """Fallback method to extract metadata from non-JSON LLM response.

        Scans the response line by line for "key: value" pairs matching the
        expected metadata fields; unmatched fields keep their defaults.
        """
        metadata = self._get_default_metadata()
        lines = response_text.split("\n")
        for line in lines:
            line = line.strip()
            # Strip surrounding quotes/commas left over from near-JSON output.
            if "university" in line.lower() and ":" in line:
                value = line.split(":", 1)[1].strip().strip('",')
                metadata["university_name"] = value
            elif "country" in line.lower() and ":" in line:
                value = line.split(":", 1)[1].strip().strip('",')
                metadata["country"] = value
            elif "document_type" in line.lower() and ":" in line:
                value = line.split(":", 1)[1].strip().strip('",')
                metadata["document_type"] = value
            elif "language" in line.lower() and ":" in line:
                value = line.split(":", 1)[1].strip().strip('",')
                metadata["language"] = value
        print(f"DEBUG: Fallback text extraction result: {metadata}")
        return metadata

    def _get_default_metadata(self) -> Dict[str, str]:
        """Return default metadata when extraction fails."""
        return {
            "university_name": "Unknown",
            "country": "Unknown",
            "document_type": "general_info",
            "language": "Unknown"
        }
303
+
304
def classify_query_type(query: str) -> str:
    """Classify *query* as "simple" or "complex" for UI display.

    Uses the same heuristics as SEALionLLM's model routing so the label
    shown in the UI matches the model that will actually be used.

    Args:
        query: The raw user question in any supported language.

    Returns:
        "simple" when the instruct model would handle the query
        (translation or low-criteria questions), "complex" otherwise.
    """
    # Bypass __init__ so no API client (and no SEA_LION_API_KEY) is needed:
    # the two heuristics below are pure string checks and never touch
    # self.client, so building a full client here was wasted work that
    # could also fail when credentials are absent.
    classifier = SEALionLLM.__new__(SEALionLLM)

    if classifier._is_translation_query(query) or not classifier._is_complex_query(query):
        return "simple"
    return "complex"
313
+
314
class DocumentIngestion:
    """Ingests university PDF documents into the persisted Chroma store.

    Pipeline: extract per-page text with PyPDF2, detect document metadata
    with the SEA-LION LLM, chunk semantically, embed and persist.
    """

    def __init__(self):
        # Initialize SEA-LION LLM for metadata extraction
        self.sea_lion_llm = SEALionLLM()

        # Use BGE embeddings by default for better performance
        try:
            self.embeddings = AlternativeEmbeddings()
            self.embedding_type = "BGE-small-en"
            if not self.embeddings.model:
                raise Exception("BGE model not available")
        except Exception:
            # Fallback to OpenAI if BGE not available; ignore obvious
            # placeholder values left in .env templates.
            openai_key = os.getenv("OPENAI_API_KEY")
            if openai_key and openai_key != "placeholder_for_embeddings" and openai_key != "your_openai_api_key_here":
                try:
                    self.embeddings = OpenAIEmbeddings()
                    self.embedding_type = "OpenAI"
                except Exception as e:
                    print("Both BGE and OpenAI embeddings failed. Please check your setup.")
                    raise e
            else:
                print("No embedding model available. Please install sentence-transformers or provide OpenAI API key.")
                raise Exception("No embedding model available")

        self.text_splitter = SemanticChunker(
            embeddings_model=self.embeddings,
            chunk_size=4,  # 4 sentences per base chunk
            overlap=1,  # 1 sentence overlap
            similarity_threshold=0.75,  # Semantic similarity threshold
            min_chunk_size=150,  # Minimum 150 characters
            max_chunk_size=1500,  # Maximum 1500 characters
            debug=True  # Show statistics in Streamlit
        )

        self.persist_directory = os.getenv("CHROMA_PERSIST_DIRECTORY", "./chroma_db")
        os.makedirs(self.persist_directory, exist_ok=True)

    def extract_text_from_pdf(self, pdf_file_path) -> List[str]:
        """Extract text from a PDF file path with multiple fallback methods.

        Args:
            pdf_file_path: Filesystem path to the PDF.

        Returns:
            One string per page; an empty list signals an unreadable PDF
            (password-protected, image-only, or a read error).
        """
        try:
            # Method 1: PyPDF2 (handles most PDFs including encrypted ones
            # when PyCryptodome is installed)
            with open(pdf_file_path, 'rb') as pdf_file:
                pdf_reader = PyPDF2.PdfReader(pdf_file)

                # Check if PDF is encrypted
                if pdf_reader.is_encrypted:
                    # Try to decrypt with empty password (common for
                    # "protected" but not password-protected PDFs)
                    try:
                        pdf_reader.decrypt("")
                    except Exception:
                        print(f"PDF {os.path.basename(pdf_file_path)} is password-protected. Please provide an unprotected version.")
                        return []  # Return empty list for password-protected PDFs

                text_per_page = []
                for page_num, page in enumerate(pdf_reader.pages):
                    try:
                        page_text = page.extract_text()
                        text_per_page.append(page_text)
                    except Exception as e:
                        print(f"Could not extract text from page {page_num + 1} of {os.path.basename(pdf_file_path)}: {str(e)}")
                        text_per_page.append("")  # Keep page alignment for failed pages

                # Require at least one non-empty page before declaring success.
                if any(text.strip() for text in text_per_page):
                    return text_per_page
                else:
                    print(f"No extractable text found in {os.path.basename(pdf_file_path)}. This might be a scanned PDF or image-based document.")
                    return []

        except Exception as e:
            error_msg = str(e)
            if "PyCryptodome" in error_msg:
                print(f"Encryption error with {os.path.basename(pdf_file_path)}: {error_msg}")
                print("💡 The PDF uses encryption. PyCryptodome has been installed to handle this.")
            elif "password" in error_msg.lower():
                print(f"Password-protected PDF: {os.path.basename(pdf_file_path)}")
                print("💡 Please provide an unprotected version of this PDF.")
            else:
                print(f"Error extracting text from {os.path.basename(pdf_file_path)}: {error_msg}")
            return []

    def process_documents(self, pdf_file_paths) -> List[Document]:
        """Process PDF file paths into Documents with LLM-extracted metadata.

        Args:
            pdf_file_paths: Iterable of filesystem paths; non-PDF entries
                are counted as failures and skipped.

        Returns:
            List of langchain Documents (one per readable PDF), each tagged
            with source, university, country, document_type, language,
            upload_timestamp and a fresh file_id.
        """
        documents = []
        processed_count = 0
        failed_count = 0

        print(f"📄 Processing {len(pdf_file_paths)} document(s) with automatic metadata detection...")

        for pdf_file_path in pdf_file_paths:
            # Case-insensitive extension check so ".PDF" uploads work too.
            if pdf_file_path.lower().endswith('.pdf'):
                filename = os.path.basename(pdf_file_path)
                # Bug fix: these messages previously printed a placeholder
                # instead of the computed (and otherwise unused) filename.
                print(f"🔍 Extracting text from: **{filename}**")

                # Extract text per page
                text_per_page = self.extract_text_from_pdf(pdf_file_path)
                print(f"DEBUG: Extracted {len(text_per_page)} pages from {filename}")

                if text_per_page:
                    # Combine first two pages for metadata extraction —
                    # title pages carry the university/country information.
                    text_for_metadata = "\n".join(text_per_page[:2])
                    print(f"DEBUG: Text for metadata extraction (first 500 chars): {text_for_metadata[:500]}")
                    # Extract metadata using LLM
                    print(f"🤖 Detecting metadata for: **{filename}**")
                    extracted_metadata = self.sea_lion_llm.extract_metadata(text_for_metadata)

                    # Create metadata
                    metadata = {
                        "source": filename,
                        "university": extracted_metadata.get("university_name", "Unknown"),
                        "country": extracted_metadata.get("country", "Unknown"),
                        "document_type": extracted_metadata.get("document_type", "general_info"),
                        "language": extracted_metadata.get("language", "Unknown"),
                        "upload_timestamp": datetime.now().isoformat(),
                        "file_id": str(uuid.uuid4())
                    }

                    # Create document from the full text of all pages
                    doc = Document(
                        page_content="\n".join(text_per_page),
                        metadata=metadata
                    )
                    documents.append(doc)
                    processed_count += 1
                    print(f"✅ Successfully processed: **{filename}** ({len(doc.page_content)} characters)")
                else:
                    failed_count += 1
                    print(f"⚠️ Could not extract text from **{filename}**")
            else:
                failed_count += 1
                filename = os.path.basename(pdf_file_path)
                print(f"❌ Unsupported file type for {filename} (expected .pdf)")

        # Summary
        if processed_count > 0:
            print(f"🎉 Successfully processed **{processed_count}** document(s)")
        if failed_count > 0:
            print(f"⚠️ Failed to process **{failed_count}** document(s)")

        return documents

    def create_vector_store(self, documents: List[Document]) -> Chroma:
        """Chunk *documents*, embed them and persist a new Chroma store.

        Returns the vector store, or None when *documents* is empty.
        """
        if not documents:
            print("No documents to process")
            return None

        # Split documents into semantic chunks before embedding
        texts = self.text_splitter.split_documents(documents)

        # Create vector store
        vectorstore = Chroma.from_documents(
            documents=texts,
            embedding=self.embeddings,
            persist_directory=self.persist_directory
        )

        return vectorstore

    def load_existing_vectorstore(self) -> Optional[Chroma]:
        """Load the persisted vector store, or None if it cannot be opened."""
        try:
            vectorstore = Chroma(
                persist_directory=self.persist_directory,
                embedding_function=self.embeddings
            )
            return vectorstore
        except Exception as e:
            print(f"Could not load existing vector store: {str(e)}")
            return None
485
+
486
class RAGSystem:
    """Retrieval-augmented QA over the persisted Chroma store using SEA-LION models."""

    def __init__(self):
        # Initialize embeddings - try local sentence-transformers (BGE)
        # first, fall back to OpenAI so the system still starts when the
        # local models cannot load.
        try:
            self.embeddings = AlternativeEmbeddings()
            if not self.embeddings.model:
                # Fallback to OpenAI if BGE not available
                self.embeddings = OpenAIEmbeddings()
        except Exception:
            # If both fail, use OpenAI as last resort
            self.embeddings = OpenAIEmbeddings()

        self.sea_lion_llm = SEALionLLM()
        self.persist_directory = os.getenv("CHROMA_PERSIST_DIRECTORY", "./chroma_db")

    def get_vectorstore(self) -> Optional[Chroma]:
        """Open the persisted vector store; returns None on failure."""
        try:
            vectorstore = Chroma(
                persist_directory=self.persist_directory,
                embedding_function=self.embeddings
            )
            return vectorstore
        except Exception as e:
            print(f"Error loading vector store: {str(e)}")
            return None

    def query(self, question: str, language: str = "English") -> Dict[str, Any]:
        """Answer *question* using retrieved document context.

        Args:
            question: The user's question.
            language: Language the answer should be written in.

        Returns:
            Dict with keys "answer", "source_documents", "query_id" and,
            on success, "original_question", "language" and "model_used".
        """
        vectorstore = self.get_vectorstore()
        # Bug fix: get_vectorstore can return None; without this guard the
        # call below raised AttributeError and surfaced a confusing error
        # string instead of this actionable message.
        if not vectorstore:
            return {
                "answer": "No documents have been ingested yet. Please upload some PDF documents first.",
                "source_documents": [],
                "query_id": None
            }

        try:
            # Retrieve the 5 most relevant chunks
            retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
            relevant_docs = retriever.get_relevant_documents(question)

            # Prepare context from retrieved documents, truncating each
            # chunk so the prompt stays within the model's token budget.
            context_parts = []
            for i, doc in enumerate(relevant_docs, 1):
                source_info = doc.metadata.get('source', 'Unknown')
                university = doc.metadata.get('university', 'Unknown')
                country = doc.metadata.get('country', 'Unknown')

                context_parts.append(f"""
Document {i} (Source: {source_info}, University: {university}, Country: {country}):
{doc.page_content[:500]}...
""")

            context = "\n".join(context_parts)

            # Generate response using SEA-LION model
            answer = self.sea_lion_llm.generate_response(
                query=question,
                context=context,
                language=language
            )

            # Generate query ID for sharing
            query_id = str(uuid.uuid4())

            return {
                "answer": answer,
                "source_documents": relevant_docs,
                "query_id": query_id,
                "original_question": question,
                "language": language,
                "model_used": "SEA-LION" + (" Reasoning" if self.sea_lion_llm._is_complex_query(question) else " Instruct")
            }

        except Exception as e:
            print(f"Error querying system: {str(e)}")
            return {
                "answer": f"Error processing your question: {str(e)}",
                "source_documents": [],
                "query_id": None
            }
568
+
569
def save_query_result(query_result: Dict[str, Any]):
    """Persist a query result as JSON so it can be shared via its query id.

    Returns True when the file was written, False when there is no query_id
    or the write fails.
    """
    query_id = query_result.get("query_id")
    if not query_id:
        return False

    results_dir = "query_results"
    os.makedirs(results_dir, exist_ok=True)
    result_file = f"{results_dir}/{query_id}.json"

    def _source_entry(doc):
        # Keep only the JSON-serializable fields of each retrieved document.
        preview = doc.page_content
        if len(preview) > 200:
            preview = preview[:200] + "..."
        return {
            "source": doc.metadata.get("source", "Unknown"),
            "university": doc.metadata.get("university", "Unknown"),
            "country": doc.metadata.get("country", "Unknown"),
            "content_preview": preview,
        }

    save_data = {
        "query_id": query_id,
        "question": query_result.get("original_question", ""),
        "answer": query_result["answer"],
        "language": query_result.get("language", "English"),
        "timestamp": datetime.now().isoformat(),
        "sources": [_source_entry(doc) for doc in query_result.get("source_documents", [])],
    }

    try:
        with open(result_file, 'w', encoding='utf-8') as f:
            json.dump(save_data, f, indent=2, ensure_ascii=False)
        return True
    except Exception as e:
        print(f"Error saving query result: {str(e)}")
        return False
603
+
604
def load_shared_query(query_id: str) -> Optional[Dict[str, Any]]:
    """Load a previously saved shared query result.

    Returns the parsed JSON dict, or None when the file is missing or
    cannot be read.
    """
    result_file = f"query_results/{query_id}.json"

    if not os.path.exists(result_file):
        return None

    try:
        with open(result_file, 'r', encoding='utf-8') as f:
            return json.load(f)
    except Exception as e:
        print(f"Error loading shared query: {str(e)}")
        return None
utils/translations.py ADDED
@@ -0,0 +1,753 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ translations = {
2
+ "English": {
3
+ # Navigation
4
+ "search_universities": "🔍 Search Universities",
5
+ "upload_documents": "📄 Upload Documents",
6
+ "manage_documents": "🗂 Manage Documents",
7
+ "about": "ℹ️ About Top.Edu",
8
+ "navigation": "🎯 Navigation",
9
+
10
+ # Main header
11
+ "app_title": "🎓 Top.Edu",
12
+ "app_subtitle": "Unlock ASEAN Education with AI-Powered Search",
13
+
14
+ # Search page
15
+ "search_header": "🔍 Search University Information",
16
+ "search_description": "Ask about admissions, fees, scholarships, and programs:",
17
+ "language_label": "Response Language",
18
+ "your_question": "Your question:",
19
+ "placeholder_text": "e.g., Master's in Malaysia under 40,000 RMB/year",
20
+ "example_queries": "💡 See Example Queries",
21
+ "complex_queries": "🧠 Complex Queries (Uses Reasoning Model)",
22
+ "simple_queries": "⚡ Simple Queries (Uses Instruct Model)",
23
+ "advanced_filters": "🔧 Advanced Filters (Optional)",
24
+ "budget_range": "Budget Range (Local Currency/Year)",
25
+ "study_level": "Study Level",
26
+ "preferred_countries": "Preferred Countries",
27
+ "search_button": "🔍 Search",
28
+ "ready_to_search": "✅ Ready to search! Click the search button when you're ready.",
29
+ "enter_question": "💭 Enter your question in the text box above to start searching.",
30
+ "using_example": "📝 Using example:",
31
+ "responses_in": "🌐 Responses will be in",
32
+
33
+ # Upload page
34
+ "upload_header": "📄 Upload University Documents",
35
+ "upload_description": "Upload official PDF documents containing university admission requirements, fees, and program information.",
36
+ "university_name": "University Name",
37
+ "country": "Country",
38
+ "document_type": "Document Type",
39
+ "choose_files": "Choose PDF files",
40
+ "drag_drop": "Drag and drop files here",
41
+ "file_limit": "Limit 200MB per file • PDF",
42
+ "browse_files": "Browse files",
43
+ "process_documents": "🚀 Process Documents",
44
+ "processing_docs": "📄 Processing document(s)...",
45
+ "successfully_processed": "🎉 Successfully processed",
46
+ "failed_to_process": "⚠️ Failed to process",
47
+ "documents": "document(s)",
48
+ "no_docs_processed": "No documents were successfully processed.",
49
+
50
+ # Document types
51
+ "admission_requirements": "Admission Requirements",
52
+ "tuition_fees": "Tuition Fees & Costs",
53
+ "program_information": "Program Information",
54
+ "scholarship_info": "Scholarship Information",
55
+ "application_deadlines": "Application Deadlines",
56
+ "general_info": "General Information",
57
+
58
+ # Manage documents page
59
+ "manage_header": "🗂 Manage Documents",
60
+ "manage_description": "View and manage uploaded university documents in your knowledge base.",
61
+ "total_documents": "Total Documents",
62
+ "total_chunks": "Total Text Chunks",
63
+ "storage_size": "Storage Size",
64
+ "last_updated": "Last Updated",
65
+ "document_list": "📚 Document List",
66
+ "no_documents": "No documents found. Upload some documents first!",
67
+ "delete_all": "🗑️ Delete All Documents",
68
+ "confirm_delete": "⚠️ Are you sure you want to delete ALL documents? This cannot be undone.",
69
+ "yes_delete": "Yes, Delete All",
70
+ "documents_deleted": "All documents have been deleted.",
71
+
72
+ # About page
73
+ "about_header": "About Top.Edu",
74
+ "what_we_do": "🎯 What We Do",
75
+ "what_we_do_description": "Top.Edu helps students worldwide easily find accurate and up-to-date information on universities in Southeast Asia. Our platform aggregates official university documents and uses AI to answer questions about programs, tuition, entry requirements, and application deadlines.",
76
+ "who_we_are": "💡 Who We Are",
77
+ "who_we_are_description": "We are a team of education and AI enthusiasts dedicated to making higher education accessible and transparent. Our mission is to simplify the search process and provide trustworthy guidance to students looking to study abroad.",
78
+ "contact": "📞 Contact & Support",
79
+ "supported_languages": "🌏 Supported Languages",
80
+
81
+ # Countries
82
+ "singapore": "Singapore",
83
+ "malaysia": "Malaysia",
84
+ "thailand": "Thailand",
85
+ "indonesia": "Indonesia",
86
+ "philippines": "Philippines",
87
+ "vietnam": "Vietnam",
88
+ "brunei": "Brunei",
89
+
90
+ # Study levels
91
+ "diploma": "Diploma",
92
+ "bachelor": "Bachelor",
93
+ "master": "Master",
94
+ "phd": "PhD",
95
+
96
+ # Budget options
97
+ "any": "Any",
98
+ "under_10k": "<10k",
99
+ "10k_20k": "10k-20k",
100
+ "20k_30k": "20k-30k",
101
+ "30k_40k": "30k-40k",
102
+ "over_40k": ">40k",
103
+
104
+ # Example queries
105
+ "example_complex_1": "Show me universities in Malaysia for master's degrees with tuition under 40,000 RMB per year",
106
+ "example_complex_2": "Compare engineering programs in Thailand and Singapore under $15,000 per year",
107
+ "example_complex_3": "Find MBA programs in ASEAN with GMAT requirements and scholarships available",
108
+ "example_complex_4": "Universities in Indonesia with English-taught programs and no IELTS requirement",
109
+ "example_simple_1": "What does IELTS stand for?",
110
+ "example_simple_2": "What is the difference between bachelor and master degree?",
111
+ "example_simple_3": "How to apply for student visa?",
112
+ "example_simple_4": "What documents are needed for university application?",
113
+
114
+ # System messages
115
+ "systems_initialized": "✅ Systems initialized successfully!",
116
+ "can_upload_documents": "You can now upload documents.",
117
+ "initialization_error": "Error initializing systems",
118
+ "installation_help": """**Possible solutions:**
119
+ 1. Install sentence-transformers: `pip install sentence-transformers`
120
+ 2. Or provide OpenAI API key in environment variables
121
+ 3. Check that PyTorch is properly installed
122
+
123
+ **For deployment:**
124
+ - Ensure requirements.txt includes: sentence-transformers, torch, transformers""",
125
+ "please_initialize_first": "Please initialize systems first using the 'Initialize System' tab!",
126
+ "please_upload_pdf": "Please upload at least one PDF file.",
127
+ "upload_pdf_only": "Please upload PDF files only.",
128
+ "successfully_processed_docs": "Successfully processed",
129
+ "failed_create_vectorstore": "Failed to create vector store from documents.",
130
+ "no_docs_successfully_processed": "No documents were successfully processed. Please check if your PDFs are readable.",
131
+ "error_processing_docs": "Error processing documents",
132
+ "check_console": "Please check the console for more details.",
133
+ "please_upload_process_first": "Please upload and process documents first using the 'Upload Documents' tab!",
134
+ "please_enter_question": "Please enter a question.",
135
+ "processing_query": "Processing query",
136
+ "model_used": "Model Used",
137
+ "answer": "Answer",
138
+ "sources": "Sources",
139
+ "no_sources_found": "No specific sources found. This might be a general response.",
140
+ "error_querying_docs": "Error querying documents",
141
+ "ready_for_queries": "Ready for queries! Go to the 'Search & Query' tab to start asking questions.",
142
+
143
+ # Interface elements
144
+ "initialize_system": "Initialize System",
145
+ "initialize_systems": "Initialize Systems",
146
+ "initialization_status": "Initialization Status",
147
+ },
148
+
149
+ "中文": {
150
+ # Navigation
151
+ "search_universities": "🔍 搜索大学",
152
+ "upload_documents": "📄 上传文档",
153
+ "manage_documents": "🗂 管理文档",
154
+ "about": "ℹ️ 关于Top.Edu",
155
+ "navigation": "🎯 导航",
156
+
157
+ # Main header
158
+ "app_title": "🎓 Top.Edu",
159
+ "app_subtitle": "用AI驱动的搜索解锁东盟教育机会",
160
+
161
+ # Search page
162
+ "search_header": "🔍 搜索大学信息",
163
+ "search_description": "询问关于入学要求、学费、奖学金和专业项目:",
164
+ "language_label": "回复语言",
165
+ "your_question": "您的问题:",
166
+ "placeholder_text": "例如:马来西亚硕士学位,学费低于4万人民币/年",
167
+ "example_queries": "💡 查看示例问题",
168
+ "complex_queries": "🧠 复杂查询(使用推理模型)",
169
+ "simple_queries": "⚡ 简单查询(使用指令模型)",
170
+ "advanced_filters": "🔧 高级筛选(可选)",
171
+ "budget_range": "预算范围(当地货币/年)",
172
+ "study_level": "学历层次",
173
+ "preferred_countries": "首选国家",
174
+ "search_button": "🔍 搜索",
175
+ "ready_to_search": "✅ 准备搜索!准备好后点击搜索按钮。",
176
+ "enter_question": "💭 在上面的文本框中输入您的问题开始搜索。",
177
+ "using_example": "📝 使用示例:",
178
+ "responses_in": "🌐 回复将使用",
179
+
180
+ # Upload page
181
+ "upload_header": "📄 上传大学文档",
182
+ "upload_description": "上传包含大学入学要求、学费和专业信息的官方PDF文档。",
183
+ "university_name": "大学名称",
184
+ "country": "国家",
185
+ "document_type": "文档类型",
186
+ "choose_files": "选择PDF文件",
187
+ "drag_drop": "将文件拖放到此处",
188
+ "file_limit": "限制每个文件200MB • PDF",
189
+ "browse_files": "浏览文件",
190
+ "process_documents": "🚀 处理文档",
191
+ "processing_docs": "📄 正在处理文档...",
192
+ "successfully_processed": "🎉 成功处理",
193
+ "failed_to_process": "⚠️ 处理失败",
194
+ "documents": "个文档",
195
+ "no_docs_processed": "没有成功处理任何文档。",
196
+
197
+ # Document types
198
+ "admission_requirements": "入学要求",
199
+ "tuition_fees": "学费和费用",
200
+ "program_information": "专业信息",
201
+ "scholarship_info": "奖学金信息",
202
+ "application_deadlines": "申请截止日期",
203
+ "general_info": "一般信息",
204
+
205
+ # Manage documents page
206
+ "manage_header": "🗂 管理文档",
207
+ "manage_description": "查看和管理您知识库中上传的大学文档。",
208
+ "total_documents": "文档总数",
209
+ "total_chunks": "文本块总数",
210
+ "storage_size": "存储大小",
211
+ "last_updated": "最后更新",
212
+ "document_list": "📚 文档列表",
213
+ "no_documents": "未找到文档。请先上传一些文档!",
214
+ "delete_all": "🗑️ 删除所有文档",
215
+ "confirm_delete": "⚠️ 您确定要删除所有文档吗?此操作无法撤消。",
216
+ "yes_delete": "是的,删除全部",
217
+ "documents_deleted": "所有文档已被删除。",
218
+
219
+ # About page
220
+ "about_header": "关于 Top.Edu",
221
+ "what_we_do": "🎯 我们的工作",
222
+ "what_we_do_description": "Top.Edu 帮助全球学生轻松获取东南亚高校的准确且最新的信息。我们的平台整合官方大学文件,并利用 AI 回答有关课程、学费、入学要求和申请截止日期的问题。",
223
+ "who_we_are": "💡 我们是谁",
224
+ "who_we_are_description": "我们是一支热衷教育与 AI 的团队,致力于让高等教育变得更透明可及。我们的使命是简化搜索流程,为希望出国留学的学生提供可靠指导。",
225
+ "contact": "📞 联系与支持",
226
+ "supported_languages": "🌏 支持语言",
227
+
228
+ # Countries
229
+ "singapore": "新加坡",
230
+ "malaysia": "马来西亚",
231
+ "thailand": "泰国",
232
+ "indonesia": "印度尼西亚",
233
+ "philippines": "菲律宾",
234
+ "vietnam": "越南",
235
+ "brunei": "文莱",
236
+
237
+ # Study levels
238
+ "diploma": "文凭",
239
+ "bachelor": "学士",
240
+ "master": "硕士",
241
+ "phd": "博士",
242
+
243
+ # Budget options
244
+ "any": "任意",
245
+ "under_10k": "<1万",
246
+ "10k_20k": "1-2万",
247
+ "20k_30k": "2-3万",
248
+ "30k_40k": "3-4万",
249
+ "over_40k": ">4万",
250
+
251
+ # Example queries
252
+ "example_complex_1": "为我推荐马来西亚学费低于4万人民币/年的硕士学位项目",
253
+ "example_complex_2": "比较泰国和新加坡学费低于1.5万美元/年的工程专业",
254
+ "example_complex_3": "寻找东盟地区有GMAT要求和奖学金的MBA项目",
255
+ "example_complex_4": "印尼有英语授课且无需雅思的大学项目",
256
+ "example_simple_1": "IELTS是什么意思?",
257
+ "example_simple_2": "学士学位和硕士学位有什么区别?",
258
+ "example_simple_3": "如何申请学生签证?",
259
+ "example_simple_4": "大学申请需要哪些文件?",
260
+
261
+ # System messages
262
+ "systems_initialized": "✅ 系统初始化成功!",
263
+ "can_upload_documents": "您现在可以上传文档。",
264
+ "initialization_error": "系统初始化错误",
265
+ "installation_help": """**可能的解决方案:**
266
+ 1. 安装 sentence-transformers: `pip install sentence-transformers`
267
+ 2. 或在环境变量中提供 OpenAI API 密钥
268
+ 3. 检查 PyTorch 是否正确安装
269
+
270
+ **部署时:**
271
+ - 确保 requirements.txt 包含:sentence-transformers, torch, transformers""",
272
+ "please_initialize_first": "请先使用'初始化系统'选项卡初始化系统!",
273
+ "please_upload_pdf": "请至少上传一个PDF文件。",
274
+ "upload_pdf_only": "请仅上传PDF文件。",
275
+ "successfully_processed_docs": "成功处理",
276
+ "failed_create_vectorstore": "创建向量存储失败。",
277
+ "no_docs_successfully_processed": "没有成功处理任何文档。请检查您的PDF是否可读。",
278
+ "error_processing_docs": "处理文档时出错",
279
+ "check_console": "请查看控制台获取更多详细信息。",
280
+ "please_upload_process_first": "请先使用'上传文档'选项卡上传和处理文档!",
281
+ "please_enter_question": "请输入问题。",
282
+ "processing_query": "正在处理查询",
283
+ "model_used": "使用的模型",
284
+ "answer": "答案",
285
+ "sources": "来源",
286
+ "no_sources_found": "未找到特定来源。这可能是一般性回答。",
287
+ "error_querying_docs": "查询文档时出错",
288
+ "ready_for_queries": "准备查询!前往'搜索与查询'选项卡开始提问。",
289
+
290
+ # Interface elements
291
+ "initialize_system": "初始化系统",
292
+ "initialize_systems": "初始化系统",
293
+ "initialization_status": "初始化状态",
294
+ },
295
+
296
+ "Malay": {
297
+ # Navigation
298
+ "search_universities": "🔍 Cari Universiti",
299
+ "upload_documents": "📄 Muat Naik Dokumen",
300
+ "manage_documents": "🗂 Urus Dokumen",
301
+ "about": "ℹ️ Mengenai Top.Edu",
302
+ "navigation": "🎯 Navigasi",
303
+
304
+ # Main header
305
+ "app_title": "🎓 Top.Edu",
306
+ "app_subtitle": "Buka Pendidikan ASEAN dengan Carian Berkuasa AI",
307
+
308
+ # Search page
309
+ "search_header": "🔍 Cari Maklumat Universiti",
310
+ "search_description": "Tanya tentang kemasukan, yuran, biasiswa, dan program:",
311
+ "language_label": "Bahasa Respons",
312
+ "your_question": "Soalan anda:",
313
+ "placeholder_text": "cth: Ijazah Sarjana di Malaysia di bawah 40,000 RMB/tahun",
314
+ "example_queries": "💡 Lihat Contoh Soalan",
315
+ "complex_queries": "🧠 Soalan Kompleks (Menggunakan Model Penaakulan)",
316
+ "simple_queries": "⚡ Soalan Mudah (Menggunakan Model Arahan)",
317
+ "advanced_filters": "🔧 Penapis Lanjutan (Pilihan)",
318
+ "budget_range": "Julat bajet (mata wang tempatan/tahun)",
319
+ "study_level": "Tahap Pengajian",
320
+ "preferred_countries": "Negara Pilihan",
321
+ "search_button": "🔍 Cari",
322
+ "ready_to_search": "✅ Sedia untuk cari! Klik butang cari bila anda sedia.",
323
+ "enter_question": "💭 Masukkan soalan anda di kotak teks di atas untuk mula mencari.",
324
+ "using_example": "📝 Menggunakan contoh:",
325
+ "responses_in": "🌐 Respons akan dalam bahasa",
326
+
327
+ # Upload page
328
+ "upload_header": "📄 Muat Naik Dokumen Universiti",
329
+ "upload_description": "Muat naik dokumen PDF rasmi yang mengandungi keperluan kemasukan universiti, yuran, dan maklumat program.",
330
+ "university_name": "Nama Universiti",
331
+ "country": "Negara",
332
+ "document_type": "Jenis Dokumen",
333
+ "choose_files": "Pilih fail PDF",
334
+ "drag_drop": "Seret dan lepas fail di sini",
335
+ "file_limit": "Had 200MB setiap fail • PDF",
336
+ "browse_files": "Layari fail",
337
+ "process_documents": "🚀 Proses Dokumen",
338
+ "processing_docs": "📄 Memproses dokumen...",
339
+ "successfully_processed": "🎉 Berjaya diproses",
340
+ "failed_to_process": "⚠️ Gagal diproses",
341
+ "documents": "dokumen",
342
+ "no_docs_processed": "Tiada dokumen yang berjaya diproses.",
343
+
344
+ # Document types
345
+ "admission_requirements": "Keperluan Kemasukan",
346
+ "tuition_fees": "Yuran Pengajian & Kos",
347
+ "program_information": "Maklumat Program",
348
+ "scholarship_info": "Maklumat Biasiswa",
349
+ "application_deadlines": "Tarikh Tutup Permohonan",
350
+ "general_info": "Maklumat Umum",
351
+
352
+ # Manage documents page
353
+ "manage_header": "🗂 Urus Dokumen",
354
+ "manage_description": "Lihat dan urus dokumen universiti yang dimuat naik dalam pangkalan pengetahuan anda.",
355
+ "total_documents": "Jumlah Dokumen",
356
+ "total_chunks": "Jumlah Bahagian Teks",
357
+ "storage_size": "Saiz Storan",
358
+ "last_updated": "Kemaskini Terakhir",
359
+ "document_list": "📚 Senarai Dokumen",
360
+ "no_documents": "Tiada dokumen dijumpai. Muat naik beberapa dokumen dahulu!",
361
+ "delete_all": "🗑️ Padam Semua Dokumen",
362
+ "confirm_delete": "⚠️ Adakah anda pasti mahu memadam SEMUA dokumen? Tindakan ini tidak boleh dibatalkan.",
363
+ "yes_delete": "Ya, Padam Semua",
364
+ "documents_deleted": "Semua dokumen telah dipadam.",
365
+
366
+ # About page
367
+ "about_header": "Tentang Top.Edu",
368
+ "what_we_do": "🎯 Apa Yang Kami Lakukan",
369
+ "what_we_do_description": "Top.Edu membantu pelajar di seluruh dunia untuk mencari maklumat tepat dan terkini mengenai universiti di Asia Tenggara dengan mudah. Platform kami menggabungkan dokumen rasmi universiti dan menggunakan AI untuk menjawab soalan tentang program, yuran pengajian, syarat kemasukan, dan tarikh akhir permohonan.",
370
+ "who_we_are": "💡 Siapa Kami",
371
+ "who_we_are_description": "Kami adalah pasukan yang berminat dalam pendidikan dan AI, berdedikasi untuk menjadikan pendidikan tinggi lebih mudah diakses dan telus. Misi kami adalah mempermudah proses carian dan memberikan panduan yang boleh dipercayai kepada pelajar yang ingin belajar di luar negara.",
372
+ "contact": "📞 Hubungi & Sokongan",
373
+ "supported_languages": "🌏 Bahasa Disokong",
374
+
375
+ # Countries
376
+ "singapore": "Singapura",
377
+ "malaysia": "Malaysia",
378
+ "thailand": "Thailand",
379
+ "indonesia": "Indonesia",
380
+ "philippines": "Filipina",
381
+ "vietnam": "Vietnam",
382
+ "brunei": "Brunei",
383
+
384
+ # Study levels
385
+ "diploma": "Diploma",
386
+ "bachelor": "Sarjana Muda",
387
+ "master": "Sarjana",
388
+ "phd": "PhD",
389
+
390
+ # Budget options
391
+ "any": "Mana-mana",
392
+ "under_10k": "<10k",
393
+ "10k_20k": "10k-20k",
394
+ "20k_30k": "20k-30k",
395
+ "30k_40k": "30k-40k",
396
+ "over_40k": ">40k",
397
+
398
+ # Example queries
399
+ "example_complex_1": "Tunjukkan saya universiti di Malaysia untuk ijazah sarjana dengan yuran di bawah 40,000 RMB setahun",
400
+ "example_complex_2": "Bandingkan program kejuruteraan di Thailand dan Singapura di bawah $15,000 setahun",
401
+ "example_complex_3": "Cari program MBA di ASEAN dengan keperluan GMAT dan biasiswa tersedia",
402
+ "example_complex_4": "Universiti di Indonesia dengan program bahasa Inggeris dan tanpa keperluan IELTS",
403
+ "example_simple_1": "Apakah maksud IELTS?",
404
+ "example_simple_2": "Apakah perbezaan antara ijazah sarjana muda dan sarjana?",
405
+ "example_simple_3": "Bagaimana untuk memohon visa pelajar?",
406
+ "example_simple_4": "Dokumen apakah yang diperlukan untuk permohonan universiti?",
407
+ },
408
+
409
+ "ไทย": {
410
+ # Navigation
411
+ "search_universities": "🔍 ค้นหามหาวิทยาลัย",
412
+ "upload_documents": "📄 อัพโหลดเอกสาร",
413
+ "manage_documents": "🗂 จัดการเอกสาร",
414
+ "about": "ℹ️ เกี่ยวกับ Top.Edu",
415
+ "navigation": "🎯 เมนูหลัก",
416
+
417
+ # Main header
418
+ "app_title": "🎓 Top.Edu",
419
+ "app_subtitle": "ปลดล็อกการศึกษาอาเซียนด้วยการค้นหาที่ขับเคลื่อนด้วย AI",
420
+
421
+ # Search page
422
+ "search_header": "🔍 ค้นหาข้อมูลมหาวิทยาลัย",
423
+ "search_description": "ถามเกี่ยวกับการเข้าเรียน ค่าใช้จ่าย ทุนการศึกษา และหลักสูตร:",
424
+ "language_label": "ภาษาการตอบ",
425
+ "your_question": "คำถามของคุณ:",
426
+ "placeholder_text": "เช่น ปริญญาโทในมาเลเซียต่ำกว่า 40,000 หยวนจีน/ปี",
427
+ "example_queries": "💡 ดูตัวอย่างคำถาม",
428
+ "complex_queries": "🧠 คำถามซับซ้อน (ใช้โมเดลการให้เหตุผล)",
429
+ "simple_queries": "⚡ คำถามง่าย (ใช้โมเดลคำสั่ง)",
430
+ "advanced_filters": "🔧 ตัวกรองขั้นสูง (ตัวเลือก)",
431
+ "budget_range": "ช่วงงบประมาณ (สกุลเงินท้องถิ่น/ปี)",
432
+ "study_level": "ระดับการศึกษา",
433
+ "preferred_countries": "ประเทศที่ต้องการ",
434
+ "search_button": "🔍 ค้นหา",
435
+ "ready_to_search": "✅ พร้อมค้นหา! คลิกปุ่มค้นหาเมื่อคุณพร้อม",
436
+ "enter_question": "💭 ใส่คำถามในกล่องข้อความด้านบนเพื่อเริ่มค้นหา",
437
+ "using_example": "📝 ใช้ตัวอย่าง:",
438
+ "responses_in": "🌐 จะตอบเป็นภาษา",
439
+
440
+ # Upload page
441
+ "upload_header": "📄 อัพโหลดเอกสารมหาวิทยาลัย",
442
+ "upload_description": "อัพโหลดเอกสาร PDF ที่มีข้อกำหนดการรับเข้า ค่าธรรมเนียม และข้อมูลหลักสูตร",
443
+ "university_name": "ชื่อมหาวิทยาลัย",
444
+ "country": "ประเทศ",
445
+ "document_type": "ประเภทเอกสาร",
446
+ "choose_files": "เลือกไฟล์ PDF",
447
+ "file_limit": "จำกัด 200MB ต่อไฟล์ • PDF",
448
+ "process_documents": "🚀 ประมวลผลเอกสาร",
449
+ "processing_docs": "📄 กำลังประมวลผลเอกสาร...",
450
+ "successfully_processed": "🎉 ประมวลผลสำเร็จ",
451
+ "failed_to_process": "⚠️ ประมวลผลล้มเหลว",
452
+ "documents": "เอกสาร",
453
+ "no_docs_processed": "ไม่สามารถประมวลผลเอกสารใดได้สำเร็จ",
454
+
455
+ # Document types
456
+ "admission_requirements": "ข้อกำหนดการรับเข้า",
457
+ "tuition_fees": "ค่าธรรมเนียมการศึกษา",
458
+ "program_information": "ข้อมูลหลักสูตร",
459
+ "scholarship_info": "ข้อมูลทุนการศึกษา",
460
+ "application_deadlines": "กำหนดส่งใบสมัคร",
461
+ "general_info": "ข้อมูลทั่วไป",
462
+
463
+ # Manage documents page
464
+ "manage_header": "🗂 จัดการเอกสาร",
465
+ "manage_description": "ดูและจัดการเอกสารมหาวิทยาลัยในฐานข้อมูลของคุณ",
466
+ "total_documents": "เอกสารทั้งหมด",
467
+ "total_chunks": "ส่วนข้อความทั้งหมด",
468
+ "storage_size": "ขนาดที่เก็บข้อมูล",
469
+ "last_updated": "อัปเดตล่าสุด",
470
+ "document_list": "📚 รายการเอกสาร",
471
+ "no_documents": "ไม่พบเอกสาร อัปโหลดเอกสารก่อน!",
472
+ "delete_all": "🗑️ ลบเอกสารทั้งหมด",
473
+ "documents_deleted": "เอกสารทั้งหมดถูกลบแล้ว",
474
+
475
+ # About page
476
+ "about_header": "เกี่ยวกับ Top.Edu",
477
+ "what_we_do": "🎯 สิ่งที่เราทำ",
478
+ "what_we_do_description": "Top.Edu ช่วยให้นักเรียนทั่วโลกสามารถค้นหาข้อมูลที่ถูกต้องและทันสมัยเกี่ยวกับมหาวิทยาลัยในเอเชียตะวันออกเฉียงใต้ได้อย่างง่ายดาย แพลตฟอร์มของเรารวบรวมเอกสารทางการของมหาวิทยาลัยและใช้ AI เพื่อตอบคำถามเกี่ยวกับหลักสูตร ค่าเล่าเรียน ข้อกำหนดการเข้าเรียน และกำหนดส่งใบสมัคร",
479
+ "who_we_are": "💡 เราคือใคร",
480
+ "who_we_are_description": "เราคือทีมผู้ที่ชื่นชอบการศึกษาและ AI มุ่งมั่นที่จะทำให้การศึกษาระดับอุดมศึกษาสามารถเข้าถึงได้และโปร่งใส ภารกิจของเราคือทำให้กระบวนการค้นหาง่ายขึ้นและให้คำแนะนำที่เชื่อถือได้แก่นักเรียนที่ต้องการศึกษาต่อต่างประเทศ",
481
+ "contact": "📞 ติดต่อ & สนับสนุน",
482
+ "supported_languages": "🌏 ภาษาที่รองรับ",
483
+
484
+ # Countries
485
+ "singapore": "สิงคโปร์",
486
+ "malaysia": "มาเลเซีย",
487
+ "thailand": "ไทย",
488
+ "indonesia": "อินโดนีเซีย",
489
+ "philippines": "ฟิลิปปินส์",
490
+ "vietnam": "เวียดนาม",
491
+ "brunei": "บรูไน",
492
+
493
+ # Study levels
494
+ "diploma": "ประกาศนียบัตร",
495
+ "bachelor": "ปริญญาตรี",
496
+ "master": "ปริญญาโท",
497
+ "phd": "ปริญญาเอก",
498
+
499
+ # Budget options
500
+ "any": "ใดก็ได้",
501
+ "under_10k": "<10k",
502
+ "10k_20k": "10k-20k",
503
+ "20k_30k": "20k-30k",
504
+ "30k_40k": "30k-40k",
505
+ "over_40k": ">40k",
506
+
507
+ # Example queries
508
+ "example_complex_1": "แสดงมหาวิทยาลัยในมาเลเซียสำหรับปริญญาโทที่มีค่าเล่าเรียนต่ำกว่า 40,000 หยวนต่อปี",
509
+ "example_complex_2": "เปรียบเทียบหลักสูตรวิศวกรรมในไทยและสิงคโปร์ต่ำกว่า $15,000 ต่อปี",
510
+ "example_complex_3": "ค้นหาหลักสูตร MBA ในอาเซียนที่มีข้อกำหนด GMAT และทุนการศึกษา",
511
+ "example_complex_4": "มหาวิทยาลัยในอินโดนีเซียที่มีหลักสูตรสอนภาษาอังกฤษและไม่ต้องการ IELTS",
512
+ "example_simple_1": "IELTS ย่อมาจากอะไร?",
513
+ "example_simple_2": "ความแตกต่างระหว่างปริญญาตรีและปริญญาโทคืออะไร?",
514
+ "example_simple_3": "วิธีสมัครวีซ่านักเรียนอย่างไร?",
515
+ "example_simple_4": "เอกสารอะไรบ้างที่จำเป็นสำหรับการสมัครเข้ามหาวิทยาลัย?",
516
+ },
517
+
518
+ "Indonesian": {
519
+ # Navigation
520
+ "search_universities": "🔍 Cari Universitas",
521
+ "upload_documents": "📄 Unggah Dokumen",
522
+ "manage_documents": "🗂 Kelola Dokumen",
523
+ "about": "ℹ️ Tentang Top.Edu",
524
+ "navigation": "🎯 Navigasi",
525
+
526
+ # Main header
527
+ "app_title": "🎓 Top.Edu",
528
+ "app_subtitle": "Buka Pendidikan ASEAN dengan Pencarian Bertenaga AI",
529
+
530
+ # Search page
531
+ "search_header": "🔍 Cari Informasi Universitas",
532
+ "search_description": "Tanya tentang penerimaan, biaya, beasiswa, dan program:",
533
+ "language_label": "Bahasa Respon",
534
+ "your_question": "Pertanyaan Anda:",
535
+ "placeholder_text": "mis: Magister di Malaysia di bawah 40.000 RMB/tahun",
536
+ "example_queries": "💡 Lihat Contoh Pertanyaan",
537
+ "complex_queries": "🧠 Pertanyaan Kompleks (Menggunakan Model Penalaran)",
538
+ "simple_queries": "⚡ Pertanyaan Sederhana (Menggunakan Model Instruksi)",
539
+ "advanced_filters": "🔧 Filter Lanjutan (Opsional)",
540
+ "budget_range": "Rentang anggaran (mata uang lokal/tahun)",
541
+ "study_level": "Tingkat Studi",
542
+ "preferred_countries": "Negara Pilihan",
543
+ "search_button": "🔍 Cari",
544
+ "ready_to_search": "✅ Siap mencari! Klik tombol cari saat Anda siap.",
545
+ "enter_question": "💭 Masukkan pertanyaan di kotak teks di atas untuk mulai mencari.",
546
+ "using_example": "📝 Menggunakan contoh:",
547
+ "responses_in": "🌐 Respon akan dalam bahasa",
548
+
549
+ # Upload page
550
+ "upload_header": "📄 Unggah Dokumen Universitas",
551
+ "upload_description": "Unggah dokumen PDF berisi persyaratan masuk universitas, biaya, dan informasi program.",
552
+ "university_name": "Nama Universitas",
553
+ "country": "Negara",
554
+ "document_type": "Jenis Dokumen",
555
+ "choose_files": "Pilih file PDF",
556
+ "file_limit": "Batas 200MB per file • PDF",
557
+ "process_documents": "🚀 Proses Dokumen",
558
+ "processing_docs": "📄 Memproses dokumen...",
559
+ "successfully_processed": "🎉 Berhasil diproses",
560
+ "failed_to_process": "⚠️ Gagal diproses",
561
+ "documents": "dokumen",
562
+ "no_docs_processed": "Tidak ada dokumen yang berhasil diproses.",
563
+
564
+ # Document types
565
+ "admission_requirements": "Persyaratan Masuk",
566
+ "tuition_fees": "Biaya Kuliah & Biaya",
567
+ "program_information": "Informasi Program",
568
+ "scholarship_info": "Informasi Beasiswa",
569
+ "application_deadlines": "Batas Waktu Aplikasi",
570
+ "general_info": "Informasi Umum",
571
+
572
+ # Manage documents page
573
+ "manage_header": "🗂 Kelola Dokumen",
574
+ "manage_description": "Lihat dan kelola dokumen universitas dalam basis pengetahuan Anda.",
575
+ "total_documents": "Total Dokumen",
576
+ "total_chunks": "Total Bagian Teks",
577
+ "storage_size": "Ukuran Penyimpanan",
578
+ "last_updated": "Terakhir Diperbarui",
579
+ "document_list": "📚 Daftar Dokumen",
580
+ "no_documents": "Tidak ada dokumen ditemukan. Unggah beberapa dokumen terlebih dahulu!",
581
+ "delete_all": "🗑️ Hapus Semua Dokumen",
582
+ "documents_deleted": "Semua dokumen telah dihapus.",
583
+
584
+ # About page
585
+ "about_header": "Tentang Top.Edu",
586
+ "what_we_do": "🎯 Apa Yang Kami Lakukan",
587
+ "what_we_do_description": "Top.Edu membantu mahasiswa di seluruh dunia dengan mudah menemukan informasi yang akurat dan terbaru tentang universitas di Asia Tenggara. Platform kami mengumpulkan dokumen resmi universitas dan menggunakan AI untuk menjawab pertanyaan tentang program, biaya kuliah, persyaratan masuk, dan tenggat waktu pendaftaran.",
588
+ "who_we_are": "💡 Siapa Kami",
589
+ "who_we_are_description": "Kami adalah tim yang bersemangat di bidang pendidikan dan AI, berdedikasi untuk membuat pendidikan tinggi lebih mudah diakses dan transparan. Misi kami adalah menyederhanakan proses pencarian dan memberikan panduan terpercaya bagi mahasiswa yang ingin belajar di luar negeri.",
590
+ "contact": "📞 Kontak & Dukungan",
591
+ "supported_languages": "🌏 Bahasa Didukung",
592
+
593
+ # Countries
594
+ "singapore": "Singapura",
595
+ "malaysia": "Malaysia",
596
+ "thailand": "Thailand",
597
+ "indonesia": "Indonesia",
598
+ "philippines": "Filipina",
599
+ "vietnam": "Vietnam",
600
+ "brunei": "Brunei",
601
+
602
+ # Study levels
603
+ "diploma": "Diploma",
604
+ "bachelor": "Sarjana",
605
+ "master": "Magister",
606
+ "phd": "Doktor",
607
+
608
+ # Budget options
609
+ "any": "Apa saja",
610
+ "under_10k": "<10k",
611
+ "10k_20k": "10k-20k",
612
+ "20k_30k": "20k-30k",
613
+ "30k_40k": "30k-40k",
614
+ "over_40k": ">40k",
615
+
616
+ # Example queries
617
+ "example_complex_1": "Tunjukkan universitas di Malaysia untuk gelar magister dengan biaya kuliah di bawah 40.000 RMB per tahun",
618
+ "example_complex_2": "Bandingkan program teknik di Thailand dan Singapura di bawah $15.000 per tahun",
619
+ "example_complex_3": "Cari program MBA di ASEAN dengan persyaratan GMAT dan beasiswa tersedia",
620
+ "example_complex_4": "Universitas di Indonesia dengan program berbahasa Inggris tanpa persyaratan IELTS",
621
+ "example_simple_1": "Apa kepanjangan dari IELTS?",
622
+ "example_simple_2": "Apa perbedaan antara gelar sarjana dan magister?",
623
+ "example_simple_3": "Bagaimana cara mengajukan visa pelajar?",
624
+ "example_simple_4": "Dokumen apa saja yang diperlukan untuk aplikasi universitas?",
625
+ },
626
+
627
+ "Tiếng Việt": {
628
+ # Navigation
629
+ "search_universities": "🔍 Tìm kiếm Trường đại học",
630
+ "upload_documents": "📄 Tải lên Tài liệu",
631
+ "manage_documents": "🗂 Quản lý Tài liệu",
632
+ "about": "ℹ️ Về Top.Edu",
633
+ "navigation": "🎯 Điều hướng",
634
+
635
+ # Main header
636
+ "app_title": "🎓 Top.Edu",
637
+ "app_subtitle": "Mở khóa Giáo dục ASEAN với Tìm kiếm được hỗ trợ bởi AI",
638
+
639
+ # Search page
640
+ "search_header": "🔍 Tìm kiếm Thông tin Trường đại học",
641
+ "search_description": "Hỏi về tuyển sinh, học phí, học bổng và chương trình:",
642
+ "language_label": "Ngôn ngữ Phản hồi",
643
+ "your_question": "Câu hỏi của bạn:",
644
+ "placeholder_text": "vd: Thạc sĩ tại Malaysia dưới 40.000 tệ/năm",
645
+ "example_queries": "💡 Xem Câu hỏi Mẫu",
646
+ "complex_queries": "🧠 Câu hỏi Phức tạp (Sử dụng Mô hình Lý luận)",
647
+ "simple_queries": "⚡ Câu hỏi Đơn giản (Sử dụng Mô hình Hướng dẫn)",
648
+ "advanced_filters": "🔧 Bộ lọc Nâng cao (Tùy chọn)",
649
+ "budget_range": "Khoảng ngân sách (tiền tệ địa phương/năm)",
650
+ "study_level": "Bậc học",
651
+ "preferred_countries": "Quốc gia Ưa thích",
652
+ "search_button": "🔍 Tìm kiếm",
653
+ "ready_to_search": "✅ Sẵn sàng tìm kiếm! Nhấp vào nút tìm kiếm khi bạn sẵn sàng.",
654
+ "enter_question": "💭 Nhập câu hỏi vào hộp văn bản ở trên để bắt đầu tìm kiếm.",
655
+ "using_example": "📝 Sử dụng ví dụ:",
656
+ "responses_in": "🌐 Phản hồi sẽ bằng tiếng",
657
+
658
+ # Upload page
659
+ "upload_header": "📄 Tải lên Tài liệu Đại học",
660
+ "upload_description": "Tải lên tài liệu PDF chứa yêu cầu tuyển sinh, học phí và thông tin chương trình của đại học.",
661
+ "university_name": "Tên Đại học",
662
+ "country": "Quốc gia",
663
+ "document_type": "Loại Tài liệu",
664
+ "choose_files": "Chọn file PDF",
665
+ "file_limit": "Giới hạn 200MB mỗi file • PDF",
666
+ "process_documents": "🚀 Xử lý Tài liệu",
667
+ "processing_docs": "📄 Đang xử lý tài liệu...",
668
+ "successfully_processed": "🎉 Xử lý thành công",
669
+ "failed_to_process": "⚠️ Xử lý thất bại",
670
+ "documents": "tài liệu",
671
+ "no_docs_processed": "Không có tài liệu nào được xử lý thành công.",
672
+
673
+ # Document types
674
+ "admission_requirements": "Yêu cầu Tuyển sinh",
675
+ "tuition_fees": "Học phí & Chi phí",
676
+ "program_information": "Thông tin Chương trình",
677
+ "scholarship_info": "Thông tin Học bổng",
678
+ "application_deadlines": "Hạn nộp Đơn",
679
+ "general_info": "Thông tin Chung",
680
+
681
+ # Manage documents page
682
+ "manage_header": "🗂 Quản lý Tài liệu",
683
+ "manage_description": "Xem và quản lý tài liệu đại học trong cơ sở tri thức của bạn.",
684
+ "total_documents": "Tổng Tài liệu",
685
+ "total_chunks": "Tổng Đoạn Văn bản",
686
+ "storage_size": "Kích thước Lưu trữ",
687
+ "last_updated": "Cập nhật Cuối",
688
+ "document_list": "📚 Danh sách Tài liệu",
689
+ "no_documents": "Không tìm thấy tài liệu. Hãy tải lên một số tài liệu trước!",
690
+ "delete_all": "🗑️ Xóa Tất cả Tài liệu",
691
+ "documents_deleted": "Tất cả tài liệu đã được xóa.",
692
+
693
+ # About page
694
+ "about_header": "Về Top.Edu",
695
+ "what_we_do": "🎯 Chúng tôi làm gì",
696
+ "what_we_do_description": "Top.Edu giúp sinh viên toàn cầu dễ dàng tìm thông tin chính xác và cập nhật về các trường đại học ở Đông Nam Á. Nền tảng của chúng tôi tổng hợp các tài liệu chính thức của trường và sử dụng AI để trả lời các câu hỏi về chương trình học, học phí, yêu cầu đầu vào và hạn nộp hồ sơ.",
697
+ "who_we_are": "💡 Chúng tôi là ai",
698
+ "who_we_are_description": "Chúng tôi là nhóm những người đam mê giáo dục và AI, cam kết làm cho giáo dục đại học trở nên dễ tiếp cận và minh bạch. Sứ mệnh của chúng tôi là đơn giản hóa quá trình tìm kiếm và cung cấp hướng dẫn đáng tin cậy cho sinh viên muốn du học.",
699
+ "contact": "📞 Liên hệ & Hỗ trợ",
700
+ "supported_languages": "🌏 Ngôn ngữ được hỗ trợ",
701
+ # Countries
702
+ "singapore": "Singapore",
703
+ "malaysia": "Malaysia",
704
+ "thailand": "Thái Lan",
705
+ "indonesia": "Indonesia",
706
+ "philippines": "Philippines",
707
+ "vietnam": "Việt Nam",
708
+ "brunei": "Brunei",
709
+
710
+ # Study levels
711
+ "diploma": "Chứng chỉ",
712
+ "bachelor": "Cử nhân",
713
+ "master": "Thạc sĩ",
714
+ "phd": "Tiến sĩ",
715
+
716
+ # Budget options
717
+ "any": "Bất kỳ",
718
+ "under_10k": "<10k",
719
+ "10k_20k": "10k-20k",
720
+ "20k_30k": "20k-30k",
721
+ "30k_40k": "30k-40k",
722
+ "over_40k": ">40k",
723
+
724
+ # Example queries
725
+ "example_complex_1": "Cho tôi xem các trường đại học ở Malaysia cho bằng thạc sĩ với học phí dưới 40.000 tệ/năm",
726
+ "example_complex_2": "So sánh các chương trình kỹ thuật ở Thái Lan và Singapore dưới $15.000/năm",
727
+ "example_complex_3": "Tìm các chương trình MBA ở ASEAN có yêu cầu GMAT và học bổng",
728
+ "example_complex_4": "Các trường đại học ở Indonesia với chương trình giảng dạy bằng tiếng Anh không yêu cầu IELTS",
729
+ "example_simple_1": "IELTS là viết tắt của gì?",
730
+ "example_simple_2": "Sự khác biệt giữa bằng cử nhân và thạc sĩ là gì?",
731
+ "example_simple_3": "Làm thế nào để xin visa du học?",
732
+ "example_simple_4": "Những tài liệu gì cần thiết cho đơn xin học đại học?",
733
+ }
734
+ }
735
+
736
def get_text(key: str, lang: str = "English") -> str:
    """Return the translated string for *key* in *lang*.

    Falls back to the English translation when the key is missing for
    the requested language, and to the key itself as a last resort.
    """
    localized = translations.get(lang)
    if localized is not None:
        try:
            return localized[key]
        except KeyError:
            pass
    # Fall back to English; an unknown key echoes back unchanged.
    return translations["English"].get(key, key)
742
+
743
def get_language_code(display_name: str) -> str:
    """Map a UI language display name (e.g. "中文 (Chinese)") to the key
    used in the ``translations`` table.

    Any unrecognized display name resolves to "English".
    """
    known = (
        ("English", "English"),
        ("中文 (Chinese)", "中文"),
        ("Bahasa Malaysia", "Malay"),
        ("ไทย (Thai)", "ไทย"),
        ("Bahasa Indonesia", "Indonesian"),
        ("Tiếng Việt (Vietnamese)", "Tiếng Việt"),
    )
    for shown, code in known:
        if shown == display_name:
            return code
    return "English"