Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files- .gitignore +113 -0
- README.md +10 -7
- app_gradio.py +137 -0
- requirements.txt +155 -0
- runtime.txt +1 -0
- sample_documents/sample_university_requirements.txt +177 -0
- semantic_chunking.py +420 -0
- styles.css +126 -0
- tabs/help.py +168 -0
- tabs/initialize.py +55 -0
- tabs/manage.py +237 -0
- tabs/query.py +139 -0
- tabs/upload.py +99 -0
- utils/display.py +79 -0
- utils/rag_system.py +615 -0
- utils/translations.py +753 -0
.gitignore
ADDED
|
@@ -0,0 +1,113 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
*.so
|
| 6 |
+
.Python
|
| 7 |
+
build/
|
| 8 |
+
develop-eggs/
|
| 9 |
+
dist/
|
| 10 |
+
downloads/
|
| 11 |
+
eggs/
|
| 12 |
+
.eggs/
|
| 13 |
+
lib/
|
| 14 |
+
lib64/
|
| 15 |
+
parts/
|
| 16 |
+
sdist/
|
| 17 |
+
var/
|
| 18 |
+
wheels/
|
| 19 |
+
pip-wheel-metadata/
|
| 20 |
+
share/python-wheels/
|
| 21 |
+
*.egg-info/
|
| 22 |
+
.installed.cfg
|
| 23 |
+
*.egg
|
| 24 |
+
MANIFEST
|
| 25 |
+
tabs/__pycache__/
|
| 26 |
+
.gradio
|
| 27 |
+
|
| 28 |
+
# Virtual Environment
|
| 29 |
+
.venv/
|
| 30 |
+
.env/
|
| 31 |
+
venv/
|
| 32 |
+
ENV/
|
| 33 |
+
env/
|
| 34 |
+
.venv
|
| 35 |
+
myenv/
|
| 36 |
+
gradio/
|
| 37 |
+
|
| 38 |
+
# Environment Variables
|
| 39 |
+
.env
|
| 40 |
+
.env.local
|
| 41 |
+
.env.development.local
|
| 42 |
+
.env.test.local
|
| 43 |
+
.env.production.local
|
| 44 |
+
|
| 45 |
+
# IDE
|
| 46 |
+
.vscode/
|
| 47 |
+
.idea/
|
| 48 |
+
*.swp
|
| 49 |
+
*.swo
|
| 50 |
+
*~
|
| 51 |
+
|
| 52 |
+
# macOS
|
| 53 |
+
.DS_Store
|
| 54 |
+
.AppleDouble
|
| 55 |
+
.LSOverride
|
| 56 |
+
|
| 57 |
+
# Windows
|
| 58 |
+
Thumbs.db
|
| 59 |
+
ehthumbs.db
|
| 60 |
+
Desktop.ini
|
| 61 |
+
|
| 62 |
+
# Jupyter Notebooks
|
| 63 |
+
.ipynb_checkpoints
|
| 64 |
+
|
| 65 |
+
# AI/ML specific
|
| 66 |
+
chroma_db/
|
| 67 |
+
chromadb/
|
| 68 |
+
*.db
|
| 69 |
+
*.sqlite
|
| 70 |
+
*.sqlite3
|
| 71 |
+
|
| 72 |
+
# Document storage
|
| 73 |
+
documents/
|
| 74 |
+
uploaded_documents/
|
| 75 |
+
temp_documents/
|
| 76 |
+
|
| 77 |
+
# Query results and cache
|
| 78 |
+
query_results/
|
| 79 |
+
.cache/
|
| 80 |
+
.streamlit/
|
| 81 |
+
|
| 82 |
+
# Model downloads and cache
|
| 83 |
+
models/
|
| 84 |
+
.transformers_cache/
|
| 85 |
+
.huggingface/
|
| 86 |
+
sentence_transformers_cache/
|
| 87 |
+
|
| 88 |
+
# Logs
|
| 89 |
+
*.log
|
| 90 |
+
logs/
|
| 91 |
+
.logs/
|
| 92 |
+
|
| 93 |
+
# Temporary files
|
| 94 |
+
tmp/
|
| 95 |
+
temp/
|
| 96 |
+
.tmp/
|
| 97 |
+
|
| 98 |
+
# Coverage reports
|
| 99 |
+
htmlcov/
|
| 100 |
+
.tox/
|
| 101 |
+
.coverage
|
| 102 |
+
.coverage.*
|
| 103 |
+
.cache
|
| 104 |
+
nosetests.xml
|
| 105 |
+
coverage.xml
|
| 106 |
+
*.cover
|
| 107 |
+
.hypothesis/
|
| 108 |
+
.pytest_cache/
|
| 109 |
+
|
| 110 |
+
# mypy
|
| 111 |
+
.mypy_cache/
|
| 112 |
+
.dmypy.json
|
| 113 |
+
dmypy.json
|
README.md
CHANGED
|
@@ -1,12 +1,15 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: gradio
|
| 7 |
-
|
| 8 |
-
|
|
|
|
| 9 |
pinned: false
|
| 10 |
---
|
| 11 |
|
| 12 |
-
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: TopEdu_Demo
|
| 3 |
+
emoji: 🎓
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: green
|
| 6 |
sdk: gradio
|
| 7 |
+
python_version: 3.11
|
| 8 |
+
sdk_version: "5.42.0"
|
| 9 |
+
app_file: app_gradio.py
|
| 10 |
pinned: false
|
| 11 |
---
|
| 12 |
|
| 13 |
+
# TopEdu Gradio App
|
| 14 |
+
|
| 15 |
+
This is a demo of the TopEdu application.
|
app_gradio.py
ADDED
|
@@ -0,0 +1,137 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
PANSEA University Requirements Assistant - Gradio Version (Modular)
|
| 3 |
+
A comprehensive tool for navigating university admission requirements across Southeast Asia.
|
| 4 |
+
"""
|
| 5 |
+
import gradio as gr
|
| 6 |
+
import os
|
| 7 |
+
import sys
|
| 8 |
+
from datetime import datetime
|
| 9 |
+
|
| 10 |
+
# Add the current directory to Python path for imports
|
| 11 |
+
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
|
| 12 |
+
|
| 13 |
+
# Import our RAG system
|
| 14 |
+
from utils.rag_system import DocumentIngestion, RAGSystem
|
| 15 |
+
|
| 16 |
+
# Import modular tab components
|
| 17 |
+
from tabs.initialize import create_initialize_tab
|
| 18 |
+
from tabs.upload import create_upload_tab
|
| 19 |
+
from tabs.query import create_query_tab
|
| 20 |
+
from tabs.manage import create_manage_tab
|
| 21 |
+
from tabs.help import create_help_tab
|
| 22 |
+
|
| 23 |
+
def create_interface():
    """Create the main Gradio interface using modular components.

    Assembles a ``gr.Blocks`` app whose tabs come from the modular ``tabs``
    package.  A single mutable ``global_vars`` dict is handed to every tab
    so they all share the same RAG state (ingestion pipeline, RAG system,
    vector store).

    Returns:
        gr.Blocks: the assembled, not-yet-launched interface.
    """
    # Global state management - shared across all tabs.  Tabs mutate this
    # dict in place, so each tab sees objects initialized by the others.
    global_vars = {
        'doc_ingestion': None,
        'rag_system': None,
        'vectorstore': None,
    }

    # Custom CSS for better styling (tab headers plus feedback/status boxes).
    custom_css = """
    .gradio-container {
        font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
    }
    .tab-nav button {
        font-weight: 500;
        font-size: 14px;
    }
    .tab-nav button[aria-selected="true"] {
        background: linear-gradient(45deg, #1e3a8a, #3b82f6);
        color: white;
    }
    .feedback-box {
        background: #f8fafc;
        border: 1px solid #e2e8f0;
        border-radius: 8px;
        padding: 16px;
        margin: 8px 0;
    }
    .success-message {
        background: #dcfce7;
        color: #166534;
        border: 1px solid #bbf7d0;
        padding: 12px;
        border-radius: 6px;
        margin: 8px 0;
    }
    .error-message {
        background: #fef2f2;
        color: #dc2626;
        border: 1px solid #fecaca;
        padding: 12px;
        border-radius: 6px;
        margin: 8px 0;
    }
    """

    # Create the main interface
    with gr.Blocks(
        title="🌏 PANSEA University Assistant",
        theme=gr.themes.Soft(
            primary_hue="blue",
            secondary_hue="slate"
        ),
        css=custom_css,
        analytics_enabled=False
    ) as interface:

        # Header
        gr.Markdown("""
        # 🌏 TopEdu

        **Navigate University Admission Requirements Across Southeast Asia with AI-Powered Assistance**

        Upload university documents, ask questions, and get intelligent answers about admission requirements,
        programs, deadlines, and more across Southeast Asian universities.

        ---
        """)

        # Main tabs using modular components; each factory wires its own
        # widgets and callbacks against the shared global_vars dict.
        with gr.Tabs():
            create_initialize_tab(global_vars)
            create_upload_tab(global_vars)
            create_query_tab(global_vars)
            create_manage_tab(global_vars)
            create_help_tab(global_vars)

        # Footer.  NOTE(review): the session timestamp is rendered once at
        # interface-build time, not per page load.
        gr.Markdown(f"""
        ---

        **🔧 System Status**: Ready | **📅 Session**: {datetime.now().strftime('%Y-%m-%d %H:%M')} | **🔄 Version**: Modular Gradio

        💡 **Tip**: Start by initializing the system, then upload your university documents, and begin querying!
        """)

    return interface
|
| 112 |
+
|
| 113 |
+
def main():
    """Launch the application.

    Builds the interface via :func:`create_interface` and starts the Gradio
    server on all interfaces at the default port 7860, with the FastAPI
    docs endpoints disabled.
    """
    interface = create_interface()

    # Launch configuration
    interface.launch(
        share=False,             # Set to True for public sharing
        server_name="0.0.0.0",   # Allow external connections
        server_port=7860,        # Default Gradio port
        show_api=False,          # Hide API documentation
        show_error=True,         # Show detailed error messages
        quiet=False,             # Show startup messages
        favicon_path=None,       # Could add custom favicon
        app_kwargs={
            "docs_url": None,    # Disable FastAPI docs
            "redoc_url": None    # Disable ReDoc docs
        }
    )
|
| 131 |
+
|
| 132 |
+
if __name__ == "__main__":
    # Script entry point: print startup hints, then block in main().
    print("🚀 Starting PANSEA University Requirements Assistant...")
    print("📍 Access the application at: http://localhost:7860")
    print("🔗 For public sharing, set share=True in the launch() method")
    print("-" * 60)
    main()
|
requirements.txt
ADDED
|
@@ -0,0 +1,155 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
aiofiles==24.1.0
|
| 2 |
+
aiohappyeyeballs==2.6.1
|
| 3 |
+
aiohttp==3.12.15
|
| 4 |
+
aiosignal==1.4.0
|
| 5 |
+
altair==5.5.0
|
| 6 |
+
annotated-types==0.7.0
|
| 7 |
+
anyio==4.10.0
|
| 8 |
+
attrs==25.3.0
|
| 9 |
+
backoff==2.2.1
|
| 10 |
+
bcrypt==4.3.0
|
| 11 |
+
blinker==1.9.0
|
| 12 |
+
Brotli==1.1.0
|
| 13 |
+
build==1.3.0
|
| 14 |
+
cachetools==5.5.2
|
| 15 |
+
certifi==2025.8.3
|
| 16 |
+
charset-normalizer==3.4.3
|
| 17 |
+
chromadb==1.0.16
|
| 18 |
+
click==8.2.1
|
| 19 |
+
coloredlogs==15.0.1
|
| 20 |
+
dataclasses-json==0.6.7
|
| 21 |
+
distro==1.9.0
|
| 22 |
+
durationpy==0.10
|
| 23 |
+
fastapi==0.116.1
|
| 24 |
+
ffmpy==0.6.1
|
| 25 |
+
filelock==3.18.0
|
| 26 |
+
flatbuffers==25.2.10
|
| 27 |
+
frozenlist==1.7.0
|
| 28 |
+
fsspec==2025.7.0
|
| 29 |
+
gitdb==4.0.12
|
| 30 |
+
GitPython==3.1.45
|
| 31 |
+
google-auth==2.40.3
|
| 32 |
+
googleapis-common-protos==1.70.0
|
| 33 |
+
gradio==5.42.0
|
| 34 |
+
gradio_client==1.11.1
|
| 35 |
+
groovy==0.1.2
|
| 36 |
+
grpcio==1.74.0
|
| 37 |
+
h11==0.16.0
|
| 38 |
+
hf-xet==1.1.7
|
| 39 |
+
httpcore==1.0.9
|
| 40 |
+
httptools==0.6.4
|
| 41 |
+
httpx==0.28.1
|
| 42 |
+
httpx-sse==0.4.1
|
| 43 |
+
huggingface-hub==0.34.4
|
| 44 |
+
humanfriendly==10.0
|
| 45 |
+
idna==3.10
|
| 46 |
+
importlib_metadata==8.7.0
|
| 47 |
+
importlib_resources==6.5.2
|
| 48 |
+
Jinja2==3.1.6
|
| 49 |
+
jiter==0.10.0
|
| 50 |
+
joblib==1.5.1
|
| 51 |
+
jsonpatch==1.33
|
| 52 |
+
jsonpointer==3.0.0
|
| 53 |
+
jsonschema==4.25.0
|
| 54 |
+
jsonschema-specifications==2025.4.1
|
| 55 |
+
kubernetes==33.1.0
|
| 56 |
+
langchain==0.3.27
|
| 57 |
+
langchain-community==0.3.27
|
| 58 |
+
langchain-core==0.3.74
|
| 59 |
+
langchain-openai==0.3.29
|
| 60 |
+
langchain-text-splitters==0.3.9
|
| 61 |
+
langsmith==0.4.13
|
| 62 |
+
markdown-it-py==4.0.0
|
| 63 |
+
MarkupSafe==3.0.2
|
| 64 |
+
marshmallow==3.26.1
|
| 65 |
+
mdurl==0.1.2
|
| 66 |
+
mmh3==5.2.0
|
| 67 |
+
mpmath==1.3.0
|
| 68 |
+
multidict==6.6.4
|
| 69 |
+
mypy_extensions==1.1.0
|
| 70 |
+
narwhals==2.1.0
|
| 71 |
+
networkx==3.5
|
| 72 |
+
numpy==2.3.2
|
| 73 |
+
oauthlib==3.3.1
|
| 74 |
+
onnxruntime==1.22.1
|
| 75 |
+
openai==1.99.9
|
| 76 |
+
opentelemetry-api==1.36.0
|
| 77 |
+
opentelemetry-exporter-otlp-proto-common==1.36.0
|
| 78 |
+
opentelemetry-exporter-otlp-proto-grpc==1.36.0
|
| 79 |
+
opentelemetry-proto==1.36.0
|
| 80 |
+
opentelemetry-sdk==1.36.0
|
| 81 |
+
opentelemetry-semantic-conventions==0.57b0
|
| 82 |
+
orjson==3.11.1
|
| 83 |
+
overrides==7.7.0
|
| 84 |
+
packaging==25.0
|
| 85 |
+
pandas==2.3.1
|
| 86 |
+
pillow==11.3.0
|
| 87 |
+
posthog==5.4.0
|
| 88 |
+
propcache==0.3.2
|
| 89 |
+
protobuf==6.31.1
|
| 90 |
+
pyarrow==21.0.0
|
| 91 |
+
pyasn1==0.6.1
|
| 92 |
+
pyasn1_modules==0.4.2
|
| 93 |
+
pybase64==1.4.2
|
| 94 |
+
pycryptodome==3.23.0
|
| 95 |
+
pydantic==2.11.7
|
| 96 |
+
pydantic-settings==2.10.1
|
| 97 |
+
pydantic_core==2.33.2
|
| 98 |
+
pydeck==0.9.1
|
| 99 |
+
pydub==0.25.1
|
| 100 |
+
Pygments==2.19.2
|
| 101 |
+
PyPDF2==3.0.1
|
| 102 |
+
PyPika==0.48.9
|
| 103 |
+
pyproject_hooks==1.2.0
|
| 104 |
+
python-dateutil==2.9.0.post0
|
| 105 |
+
python-dotenv==1.1.1
|
| 106 |
+
python-multipart==0.0.20
|
| 107 |
+
pytz==2025.2
|
| 108 |
+
PyYAML==6.0.2
|
| 109 |
+
referencing==0.36.2
|
| 110 |
+
regex==2025.7.34
|
| 111 |
+
requests==2.32.4
|
| 112 |
+
requests-oauthlib==2.0.0
|
| 113 |
+
requests-toolbelt==1.0.0
|
| 114 |
+
rich==14.1.0
|
| 115 |
+
rpds-py==0.27.0
|
| 116 |
+
rsa==4.9.1
|
| 117 |
+
ruff==0.12.8
|
| 118 |
+
safehttpx==0.1.6
|
| 119 |
+
safetensors==0.6.2
|
| 120 |
+
scikit-learn==1.7.1
|
| 121 |
+
scipy==1.16.1
|
| 122 |
+
semantic-version==2.10.0
|
| 123 |
+
sentence-transformers==5.1.0
|
| 124 |
+
shellingham==1.5.4
|
| 125 |
+
six==1.17.0
|
| 126 |
+
smmap==5.0.2
|
| 127 |
+
sniffio==1.3.1
|
| 128 |
+
SQLAlchemy==2.0.43
|
| 129 |
+
starlette==0.47.2
|
| 130 |
+
streamlit==1.48.0
|
| 131 |
+
sympy==1.14.0
|
| 132 |
+
tenacity==9.1.2
|
| 133 |
+
threadpoolctl==3.6.0
|
| 134 |
+
tiktoken==0.11.0
|
| 135 |
+
tokenizers==0.21.4
|
| 136 |
+
toml==0.10.2
|
| 137 |
+
tomlkit==0.13.3
|
| 138 |
+
torch==2.8.0
|
| 139 |
+
tornado==6.5.2
|
| 140 |
+
tqdm==4.67.1
|
| 141 |
+
transformers==4.55.0
|
| 142 |
+
typer==0.16.0
|
| 143 |
+
typing-inspect==0.9.0
|
| 144 |
+
typing-inspection==0.4.1
|
| 145 |
+
typing_extensions==4.14.1
|
| 146 |
+
tzdata==2025.2
|
| 147 |
+
urllib3==2.5.0
|
| 148 |
+
uvicorn==0.35.0
|
| 149 |
+
uvloop==0.21.0
|
| 150 |
+
watchfiles==1.1.0
|
| 151 |
+
websocket-client==1.8.0
|
| 152 |
+
websockets==15.0.1
|
| 153 |
+
yarl==1.20.1
|
| 154 |
+
zipp==3.23.0
|
| 155 |
+
zstandard==0.23.0
|
runtime.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
python-3.11
|
sample_documents/sample_university_requirements.txt
ADDED
|
@@ -0,0 +1,177 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Sample University Admission Requirements
|
| 2 |
+
|
| 3 |
+
## National University of Singapore (NUS) - Computer Science Master's Program
|
| 4 |
+
|
| 5 |
+
### Program Overview
|
| 6 |
+
The Master of Computing (Computer Science) program at NUS is a comprehensive graduate program designed for students seeking advanced knowledge in computer science.
|
| 7 |
+
|
| 8 |
+
### Admission Requirements
|
| 9 |
+
|
| 10 |
+
#### Academic Requirements
|
| 11 |
+
- Bachelor's degree in Computer Science, Computer Engineering, or related field
|
| 12 |
+
- Minimum GPA of 3.5/4.0 or equivalent (Second Class Upper Honours)
|
| 13 |
+
- Strong background in mathematics and programming
|
| 14 |
+
|
| 15 |
+
#### English Proficiency Requirements
|
| 16 |
+
For international students whose native language is not English:
|
| 17 |
+
- IELTS: Minimum overall score of 6.5 (no band less than 6.0)
|
| 18 |
+
- TOEFL iBT: Minimum score of 85 (writing minimum 22)
|
| 19 |
+
- PTE Academic: Minimum score of 65
|
| 20 |
+
|
| 21 |
+
#### Additional Requirements
|
| 22 |
+
- Statement of Purpose (500-1000 words)
|
| 23 |
+
- Two letters of recommendation from academic or professional referees
|
| 24 |
+
- Resume/CV highlighting relevant experience
|
| 25 |
+
- Portfolio of programming projects (preferred)
|
| 26 |
+
|
| 27 |
+
### Tuition Fees (2024-2025 Academic Year)
|
| 28 |
+
- Singapore Citizens: S$12,500 per year
|
| 29 |
+
- Singapore Permanent Residents: S$17,500 per year
|
| 30 |
+
- International Students: S$25,000 per year
|
| 31 |
+
|
| 32 |
+
### Application Deadlines
|
| 33 |
+
- **Priority Round**: November 15, 2024
|
| 34 |
+
- **Regular Round**: January 31, 2025
|
| 35 |
+
- **Late Round**: March 15, 2025 (subject to availability)
|
| 36 |
+
|
| 37 |
+
### Application Process
|
| 38 |
+
1. Submit online application through NUS Graduate School portal
|
| 39 |
+
2. Upload required documents
|
| 40 |
+
3. Pay application fee of S$50
|
| 41 |
+
4. Submit by deadline
|
| 42 |
+
5. Attend interview if shortlisted (February-April)
|
| 43 |
+
6. Admission results: April-May
|
| 44 |
+
|
| 45 |
+
### Program Duration
|
| 46 |
+
- Full-time: 1.5 years (3 semesters)
|
| 47 |
+
- Part-time: 2.5 years (5 semesters)
|
| 48 |
+
|
| 49 |
+
### Financial Aid
|
| 50 |
+
- NUS Graduate Research Scholarship available for qualifying students
|
| 51 |
+
- Teaching assistantships for outstanding applicants
|
| 52 |
+
- Industry sponsorship opportunities
|
| 53 |
+
|
| 54 |
+
### Contact Information
|
| 55 |
+
- Email: gradsch@nus.edu.sg
|
| 56 |
+
- Phone: +65 6516 2492
|
| 57 |
+
- Website: www.nus.edu.sg/graduateschool
|
| 58 |
+
|
| 59 |
+
---
|
| 60 |
+
|
| 61 |
+
## University of Malaya (UM) - Engineering Master's Programs
|
| 62 |
+
|
| 63 |
+
### Program Overview
|
| 64 |
+
The Faculty of Engineering offers various Master's degree programs in engineering disciplines.
|
| 65 |
+
|
| 66 |
+
### Admission Requirements
|
| 67 |
+
|
| 68 |
+
#### Academic Requirements
|
| 69 |
+
- Bachelor's degree in Engineering or related field with minimum CGPA of 3.0/4.0
|
| 70 |
+
- For applicants with CGPA below 3.0, relevant work experience of at least 2 years required
|
| 71 |
+
|
| 72 |
+
#### English Proficiency Requirements
|
| 73 |
+
For international students:
|
| 74 |
+
- IELTS: Minimum overall score of 6.0 (no band less than 5.5)
|
| 75 |
+
- TOEFL iBT: Minimum score of 80
|
| 76 |
+
- MUET (Malaysian University English Test): Band 4 minimum
|
| 77 |
+
|
| 78 |
+
#### Program-Specific Requirements
|
| 79 |
+
- **Civil Engineering**: AutoCAD proficiency preferred
|
| 80 |
+
- **Electrical Engineering**: Basic knowledge of circuit analysis
|
| 81 |
+
- **Mechanical Engineering**: Thermodynamics and fluid mechanics background
|
| 82 |
+
|
| 83 |
+
### Tuition Fees (2024 Academic Year)
|
| 84 |
+
- Malaysian Citizens: RM 8,000 per year
|
| 85 |
+
- International Students: RM 15,000 per year
|
| 86 |
+
- ASEAN Students: RM 12,000 per year (special rate)
|
| 87 |
+
|
| 88 |
+
### Application Deadlines
|
| 89 |
+
- **Main Intake (September)**: April 30, 2024
|
| 90 |
+
- **Second Intake (February)**: October 31, 2024
|
| 91 |
+
|
| 92 |
+
### Scholarships Available
|
| 93 |
+
- UM Graduate Merit Scholarship (50% tuition fee waiver)
|
| 94 |
+
- ASEAN Scholarship Program
|
| 95 |
+
- Industry-sponsored scholarships
|
| 96 |
+
|
| 97 |
+
### Living Costs (Estimated per month)
|
| 98 |
+
- Accommodation: RM 500-800
|
| 99 |
+
- Food: RM 400-600
|
| 100 |
+
- Transportation: RM 100-200
|
| 101 |
+
- Other expenses: RM 200-300
|
| 102 |
+
- **Total: RM 1,200-1,900 per month**
|
| 103 |
+
|
| 104 |
+
### Application Requirements
|
| 105 |
+
1. Completed application form
|
| 106 |
+
2. Academic transcripts
|
| 107 |
+
3. Bachelor's degree certificate
|
| 108 |
+
4. English proficiency test results
|
| 109 |
+
5. Two reference letters
|
| 110 |
+
6. Research proposal (for research-based programs)
|
| 111 |
+
7. Passport copy
|
| 112 |
+
8. Passport-sized photographs
|
| 113 |
+
|
| 114 |
+
### Contact Information
|
| 115 |
+
- Email: admission@um.edu.my
|
| 116 |
+
- Phone: +603 7967 3026
|
| 117 |
+
- Address: Faculty of Engineering, University of Malaya, 50603 Kuala Lumpur, Malaysia
|
| 118 |
+
|
| 119 |
+
---
|
| 120 |
+
|
| 121 |
+
## Chulalongkorn University - Business Administration Master's (MBA)
|
| 122 |
+
|
| 123 |
+
### Program Overview
|
| 124 |
+
The Chulalongkorn Business School MBA program is Thailand's premier business education program.
|
| 125 |
+
|
| 126 |
+
### Admission Requirements
|
| 127 |
+
|
| 128 |
+
#### Academic Requirements
|
| 129 |
+
- Bachelor's degree from accredited institution
|
| 130 |
+
- Minimum GPA of 2.75/4.0 or equivalent
|
| 131 |
+
- GMAT score of 500+ (preferred) or GRE equivalent
|
| 132 |
+
- Minimum 2 years of work experience
|
| 133 |
+
|
| 134 |
+
#### English Proficiency Requirements
|
| 135 |
+
- TOEFL iBT: Minimum score of 79
|
| 136 |
+
- IELTS: Minimum overall score of 6.5
|
| 137 |
+
- CU-TEP: Minimum score of 80
|
| 138 |
+
|
| 139 |
+
### Tuition Fees (2024-2025)
|
| 140 |
+
- Full-time MBA: 850,000 THB (approx. USD 25,000) total program
|
| 141 |
+
- Executive MBA: 1,200,000 THB (approx. USD 35,000) total program
|
| 142 |
+
|
| 143 |
+
### Application Deadlines
|
| 144 |
+
- **Early Admission**: February 15, 2024
|
| 145 |
+
- **Regular Admission**: April 30, 2024
|
| 146 |
+
- **Final Round**: June 15, 2024
|
| 147 |
+
|
| 148 |
+
### Program Duration
|
| 149 |
+
- Full-time MBA: 16 months
|
| 150 |
+
- Executive MBA: 18 months (weekend classes)
|
| 151 |
+
|
| 152 |
+
### Scholarships
|
| 153 |
+
- Merit-based scholarships up to 50% tuition
|
| 154 |
+
- Corporate sponsorship opportunities
|
| 155 |
+
- Government scholarships for ASEAN students
|
| 156 |
+
|
| 157 |
+
### Application Process
|
| 158 |
+
1. Online application submission
|
| 159 |
+
2. Submit required documents
|
| 160 |
+
3. GMAT/GRE scores
|
| 161 |
+
4. Personal interview
|
| 162 |
+
5. Group discussion assessment
|
| 163 |
+
|
| 164 |
+
### Career Support
|
| 165 |
+
- Career counseling services
|
| 166 |
+
- Industry networking events
|
| 167 |
+
- Internship placement assistance
|
| 168 |
+
- Alumni network access
|
| 169 |
+
|
| 170 |
+
### Contact Information
|
| 171 |
+
- Email: mba@cbs.chula.ac.th
|
| 172 |
+
- Phone: +66 2 218 6601
|
| 173 |
+
- Website: www.cbs.chula.ac.th
|
| 174 |
+
|
| 175 |
+
---
|
| 176 |
+
|
| 177 |
+
*This document contains sample admission information for demonstration purposes. Please verify all details with the respective universities before applying.*
|
semantic_chunking.py
ADDED
|
@@ -0,0 +1,420 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Semantic Chunker Module for RAG Systems
|
| 3 |
+
======================================
|
| 4 |
+
|
| 5 |
+
A drop-in replacement for RecursiveCharacterTextSplitter that uses semantic similarity
|
| 6 |
+
to create more coherent chunks. Designed to work seamlessly with existing LangChain
|
| 7 |
+
and Streamlit RAG systems.
|
| 8 |
+
|
| 9 |
+
Author: AI Assistant
|
| 10 |
+
Compatible with: LangChain, BGE embeddings, OpenAI embeddings, Streamlit
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
import numpy as np
|
| 14 |
+
import re
|
| 15 |
+
from typing import List, Dict, Any, Optional, Union
|
| 16 |
+
from langchain.schema import Document
|
| 17 |
+
import streamlit as st
|
| 18 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
| 19 |
+
import logging
|
| 20 |
+
|
| 21 |
+
# Set up logging
|
| 22 |
+
logging.basicConfig(level=logging.INFO)
|
| 23 |
+
logger = logging.getLogger(__name__)
|
| 24 |
+
|
| 25 |
+
class SemanticChunker:
|
| 26 |
+
"""
|
| 27 |
+
Advanced semantic document chunker that creates coherent chunks based on
|
| 28 |
+
semantic similarity rather than fixed character counts.
|
| 29 |
+
|
| 30 |
+
Perfect for university documents, research papers, and policy documents
|
| 31 |
+
where maintaining semantic coherence is crucial.
|
| 32 |
+
"""
|
| 33 |
+
|
| 34 |
+
def __init__(self,
             embeddings_model,
             chunk_size: int = 4,
             overlap: int = 1,
             similarity_threshold: float = 0.75,
             min_chunk_size: int = 150,
             max_chunk_size: int = 1500,
             sentence_split_pattern: Optional[str] = None,
             debug: bool = False):
    """
    Initialize the semantic chunker.

    Args:
        embeddings_model: Your existing embeddings model (BGE, OpenAI, etc.)
        chunk_size: Base number of sentences per chunk (default: 4)
        overlap: Number of sentences to overlap between chunks (default: 1)
        similarity_threshold: Cosine similarity threshold for extending chunks (0.0-1.0)
        min_chunk_size: Minimum characters per chunk (skip smaller chunks)
        max_chunk_size: Maximum characters per chunk (prevent overly large chunks)
        sentence_split_pattern: Custom regex pattern for sentence splitting
        debug: Enable debug logging and statistics
    """
    self.embeddings_model = embeddings_model
    self.chunk_size = chunk_size
    self.overlap = overlap
    self.similarity_threshold = similarity_threshold
    self.min_chunk_size = min_chunk_size
    self.max_chunk_size = max_chunk_size
    self.debug = debug

    # Default sentence splitting pattern optimized for academic/university
    # documents: one or more terminators followed by whitespace.
    self.sentence_pattern = sentence_split_pattern or r'[.!?]+\s+'

    # Statistics tracking, updated as documents are chunked.
    self.stats = {
        "total_documents": 0,
        "total_chunks": 0,
        "avg_chunk_size": 0,
        "chunking_methods": {},
        "embedding_errors": 0
    }

    if self.debug:
        logger.info(f"Initialized SemanticChunker with threshold={similarity_threshold}")
|
| 78 |
+
|
| 79 |
+
def _detect_embedding_model_type(self) -> str:
    """Detect the type of embedding model being used.

    Heuristic duck-typing: a ``.model`` attribute suggests a
    sentence-transformers wrapper (BGE, etc.), a ``.client`` attribute
    suggests an OpenAI-style client; anything else is reported as Unknown.

    Returns:
        str: human-readable model-type label.
    """
    if hasattr(self.embeddings_model, 'model'):
        # Likely sentence-transformers model (BGE, etc.)
        model_name = getattr(self.embeddings_model.model, 'model_name', 'sentence-transformers')
        return f"sentence-transformers ({model_name})"
    elif hasattr(self.embeddings_model, 'client'):
        # Likely OpenAI
        return "OpenAI"
    else:
        return "Unknown"
|
| 90 |
+
|
| 91 |
+
def _preprocess_text_for_splitting(self, text: str) -> str:
    """
    Preprocess text to handle common formatting issues in university documents.

    Repairs missing spaces after sentence terminators, collapses runs of
    whitespace, and normalizes bullet markers, then strips the result.

    Args:
        text: raw document text.

    Returns:
        str: cleaned text ready for sentence splitting.
    """
    # Ordered (pattern, replacement) fixes; order matters because the
    # whitespace-collapse pass runs after the space-insertion passes.
    fixes = [
        # Add space after periods before capital letters
        (r'([a-z])\.([A-Z])', r'\1. \2'),
        # Add space after numbers with periods
        (r'([0-9]+)\.([A-Z])', r'\1. \2'),
        # Fix missing spaces after question/exclamation marks
        (r'([a-z])\?([A-Z])', r'\1? \2'),
        (r'([a-z])\!([A-Z])', r'\1! \2'),
        # Clean up multiple spaces
        (r'\s+', ' '),
        # Fix bullet points.  NOTE(review): the hyphen rule also matches
        # intra-word hyphens before capitals (e.g. "e-Mail") — confirm
        # this is acceptable before tightening it.
        (r'•\s*([A-Z])', r'• \1'),
        (r'-\s*([A-Z])', r'- \1'),
    ]

    processed_text = text
    for pattern, replacement in fixes:
        processed_text = re.sub(pattern, replacement, processed_text)

    return processed_text.strip()
|
| 116 |
+
|
| 117 |
+
def _split_into_sentences(self, text: str) -> List[str]:
|
| 118 |
+
"""
|
| 119 |
+
Advanced sentence splitting optimized for academic documents.
|
| 120 |
+
"""
|
| 121 |
+
# Preprocess text
|
| 122 |
+
text = self._preprocess_text_for_splitting(text)
|
| 123 |
+
|
| 124 |
+
# Split on sentence boundaries
|
| 125 |
+
raw_sentences = re.split(self.sentence_pattern, text)
|
| 126 |
+
|
| 127 |
+
# Clean and filter sentences
|
| 128 |
+
sentences = []
|
| 129 |
+
for sentence in raw_sentences:
|
| 130 |
+
sentence = sentence.strip()
|
| 131 |
+
|
| 132 |
+
# Filter out very short sentences, pure numbers, or empty strings
|
| 133 |
+
if len(sentence) >= 10 and not sentence.isdigit() and not re.match(r'^[^\w]*$', sentence):
|
| 134 |
+
sentences.append(sentence)
|
| 135 |
+
|
| 136 |
+
if self.debug:
|
| 137 |
+
logger.info(f"Split text into {len(sentences)} sentences")
|
| 138 |
+
|
| 139 |
+
return sentences
|
| 140 |
+
|
| 141 |
+
def _get_embeddings(self, texts: List[str]) -> Optional[np.ndarray]:
|
| 142 |
+
"""
|
| 143 |
+
Get embeddings from the provided model with error handling.
|
| 144 |
+
"""
|
| 145 |
+
try:
|
| 146 |
+
if hasattr(self.embeddings_model, 'model'):
|
| 147 |
+
# sentence-transformers model (BGE, etc.)
|
| 148 |
+
embeddings = self.embeddings_model.model.encode(texts)
|
| 149 |
+
return np.array(embeddings)
|
| 150 |
+
elif hasattr(self.embeddings_model, 'embed_documents'):
|
| 151 |
+
# OpenAI or similar API-based embeddings
|
| 152 |
+
embeddings = self.embeddings_model.embed_documents(texts)
|
| 153 |
+
return np.array(embeddings)
|
| 154 |
+
else:
|
| 155 |
+
# Try direct call
|
| 156 |
+
embeddings = self.embeddings_model(texts)
|
| 157 |
+
return np.array(embeddings)
|
| 158 |
+
|
| 159 |
+
except Exception as e:
|
| 160 |
+
self.stats["embedding_errors"] += 1
|
| 161 |
+
if self.debug:
|
| 162 |
+
logger.error(f"Error generating embeddings: {e}")
|
| 163 |
+
|
| 164 |
+
# Show warning in Streamlit if available
|
| 165 |
+
try:
|
| 166 |
+
st.warning(f"⚠️ Embedding error, falling back to simple chunking: {str(e)[:100]}...")
|
| 167 |
+
except:
|
| 168 |
+
pass # Streamlit not available
|
| 169 |
+
|
| 170 |
+
return None
|
| 171 |
+
|
| 172 |
+
def _calculate_semantic_boundaries(self, embeddings: np.ndarray, sentences: List[str]) -> List[int]:
    """Locate sentence indices where the topic appears to shift.

    Cosine similarity is computed between each sentence and its successor;
    indices whose similarity drops more than half a standard deviation
    below the document's mean are treated as boundaries.  The result
    always includes 0 and ``len(sentences)`` and is returned sorted with
    duplicates removed.
    """
    # Similarity of each sentence with its immediate successor.
    pair_sims = [
        cosine_similarity(embeddings[i:i + 1], embeddings[i + 1:i + 2])[0][0]
        for i in range(len(embeddings) - 1)
    ]

    cut_points = {0, len(sentences)}  # always bound the whole document

    # Adaptive threshold: flag dips relative to this document's own
    # similarity distribution rather than a fixed constant.
    if len(pair_sims) > 1:
        dip_threshold = np.mean(pair_sims) - (0.5 * np.std(pair_sims))
        for idx, sim in enumerate(pair_sims):
            if sim < dip_threshold:
                cut_points.add(idx + 1)

    return sorted(cut_points)
def _create_chunks_from_boundaries(self, sentences: List[str], boundaries: List[int],
                                   embeddings: Optional[np.ndarray], metadata: Dict[str, Any]) -> List[Document]:
    """
    Create document chunks based on semantic boundaries.

    For each [boundaries[i], boundaries[i+1]) sentence span, a chunk is
    built.  When embeddings are available, the chunk is greedily extended
    past its boundary while the next sentence's similarity to the chunk's
    mean embedding stays above ``self.similarity_threshold`` and the
    joined text stays within ``self.max_chunk_size``.  Chunks shorter
    than ``self.min_chunk_size`` are dropped.  Each emitted Document
    carries provenance metadata (chunk index, sentence span, method,
    sizes) merged onto a copy of *metadata*.

    NOTE(review): the extension loop may run past the next boundary, so an
    extended chunk can duplicate sentences that the following span also
    emits — confirm this overlap is intended.
    """
    chunks = []

    for i in range(len(boundaries) - 1):
        start_idx = boundaries[i]
        end_idx = boundaries[i + 1]

        # Create base chunk
        chunk_sentences = sentences[start_idx:end_idx]

        # Try to extend chunk if semantically similar
        if embeddings is not None and end_idx < len(sentences):
            # Mean embedding of the current chunk, kept 2-D (keepdims) so it
            # can be fed straight into cosine_similarity.
            current_embedding = np.mean(embeddings[start_idx:end_idx], axis=0, keepdims=True)

            # Check if we can extend the chunk
            extended_end = end_idx
            while extended_end < len(sentences):
                next_sentence_embedding = embeddings[extended_end:extended_end+1]
                similarity = cosine_similarity(current_embedding, next_sentence_embedding)[0][0]

                if similarity > self.similarity_threshold:
                    # Check size limit before committing to the extension.
                    test_chunk = ' '.join(sentences[start_idx:extended_end+1])
                    if len(test_chunk) <= self.max_chunk_size:
                        extended_end += 1
                        # Update current embedding so the running mean reflects
                        # the newly absorbed sentence.
                        current_embedding = np.mean(embeddings[start_idx:extended_end], axis=0, keepdims=True)
                    else:
                        break
                else:
                    break

            # Use extended chunk if we found extensions
            if extended_end > end_idx:
                chunk_sentences = sentences[start_idx:extended_end]

        # Create chunk text
        chunk_text = ' '.join(chunk_sentences)

        # Only add chunks that meet minimum size requirement
        if len(chunk_text) >= self.min_chunk_size:
            chunk_metadata = metadata.copy()
            chunk_metadata.update({
                "chunk_index": len(chunks),
                "sentence_count": len(chunk_sentences),
                "start_sentence": start_idx,
                "end_sentence": start_idx + len(chunk_sentences) - 1,
                "chunking_method": "semantic_boundary",
                "similarity_threshold": self.similarity_threshold,
                "chunk_size_chars": len(chunk_text)
            })

            chunks.append(Document(page_content=chunk_text, metadata=chunk_metadata))

    return chunks
def _create_simple_chunks(self, sentences: List[str], metadata: Dict[str, Any]) -> List[Document]:
    """Sentence-window fallback used when embeddings are unavailable.

    Walks the sentence list in strides of ``chunk_size - overlap`` and
    emits windows of ``chunk_size`` sentences, skipping any window whose
    joined text is shorter than ``min_chunk_size``.
    """
    stride = max(1, self.chunk_size - self.overlap)
    fallback_chunks = []

    for window_start in range(0, len(sentences), stride):
        window = sentences[window_start:window_start + self.chunk_size]
        window_text = ' '.join(window)

        if len(window_text) < self.min_chunk_size:
            continue  # too small to be a useful retrieval unit

        window_meta = metadata.copy()
        window_meta.update({
            "chunk_index": len(fallback_chunks),
            "sentence_count": len(window),
            "start_sentence": window_start,
            "end_sentence": window_start + len(window) - 1,
            "chunking_method": "simple_fallback",
            "chunk_size_chars": len(window_text)
        })
        fallback_chunks.append(Document(page_content=window_text, metadata=window_meta))

    return fallback_chunks
def split_documents(self, documents: "List[Document]") -> "List[Document]":
    """
    Main method: Split documents into semantically coherent chunks.

    Args:
        documents: List of LangChain Document objects

    Returns:
        List of Document objects with semantic chunks
    """
    all_chunks = []
    self.stats["total_documents"] = len(documents)

    for doc_idx, doc in enumerate(documents):
        try:
            # Split document into sentences
            sentences = self._split_into_sentences(doc.page_content)

            if not sentences:
                if self.debug:
                    logger.warning(f"No sentences found in document {doc_idx}")
                continue

            # Very short documents become a single chunk; no embedding work.
            if len(sentences) < self.chunk_size:
                chunk_text = ' '.join(sentences)
                if len(chunk_text) >= self.min_chunk_size:
                    chunk_metadata = doc.metadata.copy()
                    chunk_metadata.update({
                        "chunk_index": 0,
                        "total_chunks": 1,
                        "sentence_count": len(sentences),
                        "chunking_method": "single_chunk",
                        "chunk_size_chars": len(chunk_text)
                    })
                    all_chunks.append(Document(page_content=chunk_text, metadata=chunk_metadata))
                continue

            # Generate embeddings
            embeddings = self._get_embeddings(sentences)

            if embeddings is not None:
                # BUG FIX: boundaries were previously hard-coded to
                # [0, len(sentences)], which bypassed boundary detection
                # entirely (leaving _calculate_semantic_boundaries dead
                # code) and emitted one giant chunk per document.  Use the
                # detected semantic boundaries instead.
                boundaries = self._calculate_semantic_boundaries(embeddings, sentences)
                chunks = self._create_chunks_from_boundaries(sentences, boundaries, embeddings, doc.metadata)
                method = "semantic"
            else:
                # Fallback to simple chunking
                chunks = self._create_simple_chunks(sentences, doc.metadata)
                method = "simple_fallback"

            # Track which strategy handled this document.
            self.stats["chunking_methods"][method] = self.stats["chunking_methods"].get(method, 0) + 1

            # Update total chunks count in each chunk's metadata
            for chunk in chunks:
                chunk.metadata["total_chunks"] = len(chunks)
                chunk.metadata["source_document_index"] = doc_idx

            all_chunks.extend(chunks)

            if self.debug:
                logger.info(f"Document {doc_idx}: {len(sentences)} sentences → {len(chunks)} chunks ({method})")

        except Exception as e:
            logger.error(f"Error processing document {doc_idx}: {e}")
            if self.debug:
                # ROBUSTNESS FIX: guard the Streamlit call — if `st` is
                # unavailable, the NameError would otherwise escape this
                # except handler and abort the whole batch.
                try:
                    st.error(f"Error processing document {doc_idx}: {e}")
                except Exception:
                    pass  # the logger line above already recorded it

    # Update final statistics
    self.stats["total_chunks"] = len(all_chunks)
    if all_chunks:
        chunk_sizes = [len(chunk.page_content) for chunk in all_chunks]
        self.stats["avg_chunk_size"] = sum(chunk_sizes) / len(chunk_sizes)

    if self.debug:
        logger.info(f"Created {len(all_chunks)} total chunks from {len(documents)} documents")

    return all_chunks
def get_statistics(self) -> Dict[str, Any]:
    """Return a shallow copy of the accumulated chunking statistics.

    A copy is returned so callers cannot accidentally mutate the
    chunker's internal counters.
    """
    return dict(self.stats)
def display_statistics(self):
    """Render chunking statistics in Streamlit, or print them to stdout.

    Uses the Streamlit API when the ``st`` binding is usable; otherwise
    (Streamlit missing or not imported) falls back to a plain-text dump
    on the console.
    """
    try:
        with st.expander("📊 Semantic Chunking Statistics"):
            col1, col2 = st.columns(2)

            with col1:
                st.metric("Total Documents", self.stats["total_documents"])
                st.metric("Total Chunks", self.stats["total_chunks"])

            with col2:
                st.metric("Avg Chunk Size", f"{self.stats['avg_chunk_size']:.0f} chars")
                st.metric("Embedding Errors", self.stats["embedding_errors"])

            if self.stats["chunking_methods"]:
                st.write("**Chunking Methods Used:**")
                for method, count in self.stats["chunking_methods"].items():
                    percentage = (count / self.stats["total_documents"]) * 100 if self.stats["total_documents"] > 0 else 0
                    st.write(f"  - {method}: {count} documents ({percentage:.1f}%)")

            st.write("**Configuration:**")
            st.json({
                "chunk_size": self.chunk_size,
                "overlap": self.overlap,
                "similarity_threshold": self.similarity_threshold,
                "min_chunk_size": self.min_chunk_size,
                "max_chunk_size": self.max_chunk_size,
                "embedding_model": self._detect_embedding_model_type()
            })

    except (ImportError, NameError, AttributeError):
        # BUG FIX: the original caught only ImportError, but an unbound `st`
        # raises NameError (and a stub module raises AttributeError), so the
        # console fallback below was unreachable in those cases.
        print("\n=== Semantic Chunking Statistics ===")
        print(f"Documents processed: {self.stats['total_documents']}")
        print(f"Chunks created: {self.stats['total_chunks']}")
        print(f"Average chunk size: {self.stats['avg_chunk_size']:.0f} characters")
        print(f"Embedding errors: {self.stats['embedding_errors']}")
        print(f"Chunking methods: {self.stats['chunking_methods']}")
def create_semantic_chunker(embeddings_model, **kwargs) -> SemanticChunker:
    """
    Convenience function to create a semantic chunker with sensible defaults.

    Args:
        embeddings_model: Your existing embeddings model
        **kwargs: Additional parameters to pass to SemanticChunker

    Returns:
        SemanticChunker instance ready to use
    """
    chunker = SemanticChunker(embeddings_model=embeddings_model, **kwargs)
    return chunker
|
styles.css
ADDED
|
@@ -0,0 +1,126 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/* Stylesheet for the Streamlit-based PANSEA assistant UI.  Targets
   Streamlit's generated class names (.stApp, .stButton, .stFileUploader,
   .stInfo/.stSuccess/.stWarning/.stError) plus app-specific containers
   rendered via st.markdown HTML (.query-result, .source-doc, ...). */

/* Hero banner shown at the top of the page. */
.main-header {
    text-align: center;
    padding: 2rem 0;
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    color: white;
    margin: -1rem -1rem 2rem -1rem; /* negative margin bleeds over default page padding */
    border-radius: 10px;
    box-shadow: 0 4px 15px 0 rgba(31, 38, 135, 0.37);
}

/* Let the app background follow the active Streamlit theme variable. */
.stApp {
    background: var(--background-color);
}

/* Dark theme compatible containers */
/* Frosted-glass card wrapping a query answer. */
.query-result {
    background: rgba(255, 255, 255, 0.05);
    backdrop-filter: blur(10px);
    border: 1px solid rgba(255, 255, 255, 0.1);
    padding: 1.5rem;
    border-radius: 15px;
    margin: 1rem 0;
    color: var(--text-color);
}

/* Citation card for a retrieved source document. */
.source-doc {
    background: rgba(31, 119, 180, 0.1);
    backdrop-filter: blur(5px);
    padding: 1rem;
    border-left: 4px solid #1f77b4;
    border-radius: 8px;
    margin: 0.5rem 0;
    color: var(--text-color);
}

/* Highlight box for shareable links. */
.share-link {
    background: rgba(46, 204, 113, 0.1);
    backdrop-filter: blur(5px);
    padding: 1rem;
    border-radius: 10px;
    border-left: 4px solid #2ecc71;
    color: var(--text-color);
}

/* Model indicator boxes */
.model-info {
    background: rgba(52, 152, 219, 0.15);
    backdrop-filter: blur(10px);
    padding: 15px;
    border-radius: 12px;
    border-left: 4px solid #3498db;
    margin: 10px 0;
}

/* Language selection enhancement */
.language-selection {
    background: rgba(155, 89, 182, 0.1);
    backdrop-filter: blur(10px);
    padding: 15px;
    border-radius: 12px;
    border-left: 4px solid #9b59b6;
    margin: 10px 0;
}

/* Upload area enhancement */
.stFileUploader {
    background: rgba(230, 126, 34, 0.1);
    backdrop-filter: blur(10px);
    padding: 20px;
    border-radius: 15px;
    border: 2px dashed #e67e22;
}

.stFileUploader label {
    font-size: 1.2rem;
    font-weight: bold;
    color: var(--text-color);
}

/* Button enhancements */
.stButton > button {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    color: white;
    border: none;
    border-radius: 10px;
    padding: 0.6rem 1.5rem;
    font-weight: 600;
    transition: all 0.3s ease;
    box-shadow: 0 4px 15px 0 rgba(31, 38, 135, 0.37);
}

/* Lift the button slightly on hover. */
.stButton > button:hover {
    transform: translateY(-2px);
    box-shadow: 0 6px 20px 0 rgba(31, 38, 135, 0.5);
}

/* Sidebar enhancements */
/* NOTE(review): .css-1d391kg is an auto-generated (emotion) class name and
   may stop matching across Streamlit versions — confirm it still applies. */
.css-1d391kg {
    background: rgba(255, 255, 255, 0.02);
    backdrop-filter: blur(10px);
}

/* Info boxes */
.stInfo {
    background: rgba(52, 152, 219, 0.1);
    backdrop-filter: blur(10px);
    border-left: 4px solid #3498db;
}

.stSuccess {
    background: rgba(46, 204, 113, 0.1);
    backdrop-filter: blur(10px);
    border-left: 4px solid #2ecc71;
}

.stWarning {
    background: rgba(241, 196, 15, 0.1);
    backdrop-filter: blur(10px);
    border-left: 4px solid #f1c40f;
}

.stError {
    background: rgba(231, 76, 60, 0.1);
    backdrop-filter: blur(10px);
    border-left: 4px solid #e74c3c;
}
|
tabs/help.py
ADDED
|
@@ -0,0 +1,168 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Help tab functionality for the Gradio app
|
| 3 |
+
"""
|
| 4 |
+
import gradio as gr
|
| 5 |
+
|
| 6 |
+
def create_help_tab(global_vars):
    """Create the Help tab with comprehensive documentation.

    The tab is purely static: a single Markdown document rendered inside
    a Gradio Tab.  *global_vars* is accepted for signature consistency
    with the other tab builders but is not used here.
    """
    with gr.Tab("❓ Help", id="help"):
        # Static user guide; edit the Markdown below to change the help text.
        gr.Markdown("""
# 🌏 PANSEA University Requirements Assistant - User Guide

Welcome to the PANSEA (Pan-Southeast Asian) University Requirements Assistant! This tool helps you navigate university admission requirements across Southeast Asian countries using advanced AI-powered document analysis.

---

## 🚀 Getting Started

### Step 1: Initialize the System
1. Go to the **🔧 Initialize** tab
2. Click **"Initialize All Systems"**
3. Wait for the success message
4. The system will set up AI models and document processing capabilities

### Step 2: Upload Documents
1. Navigate to the **📤 Upload Documents** tab
2. Select one or more PDF files containing university requirement information
3. Fill in the document metadata:
   - **University Name**: Official name of the institution
   - **Country**: Select from Southeast Asian countries
   - **Document Type**: Choose the type of document
   - **Language**: Document language
4. Click **"Process Documents"**
5. Wait for processing completion

### Step 3: Query Documents
1. Go to the **🔍 Query Documents** tab
2. Type your question in the query box
3. Click **"Search Documents"**
4. Review the AI-generated answer and source references
5. Use example questions to explore different types of queries

### Step 4: Manage Documents
1. Visit the **🗂 Manage Documents** tab
2. View all uploaded documents and statistics
3. Delete individual documents or clear all documents as needed

---

## 📖 Features Overview

### 🤖 AI-Powered Analysis
- Uses advanced SEA-LION AI models optimized for Southeast Asian contexts
- Semantic search across your document collection
- Contextual answers with source citations
- Multi-language document support

### 📚 Document Management
- Support for PDF documents
- Intelligent text chunking for better search results
- Metadata tracking (university, country, document type, language)
- Easy document deletion and management

### 🌐 Regional Focus
- Specialized for Southeast Asian universities
- Supports multiple countries and languages
- Culturally aware responses
- Up-to-date admission requirement information

---

## 💡 Usage Tips

### Asking Better Questions
- **Be Specific**: "What are the English proficiency requirements for Computer Science at NUS?" instead of "What are the requirements?"
- **Include Context**: Mention specific programs, countries, or universities you're interested in
- **Use Keywords**: Include terms like "admission", "requirements", "GPA", "test scores", etc.

### Document Upload Best Practices
- **Quality Documents**: Upload official university brochures, requirement documents, or application guides
- **Accurate Metadata**: Fill in all metadata fields correctly for better search results
- **Regular Updates**: Replace outdated documents with current versions
- **Organized Approach**: Upload documents systematically by country or university

### Managing Your Knowledge Base
- **Regular Maintenance**: Remove outdated documents periodically
- **Logical Organization**: Group related documents together
- **Backup Important Queries**: Save important answers for future reference

---

## 🛠 Troubleshooting

### Common Issues

**Problem**: "Please initialize systems first" error
- **Solution**: Go to the Initialize tab and click "Initialize All Systems"

**Problem**: Document upload fails
- **Solution**: Ensure PDF files are not corrupted and contain text (not just images)

**Problem**: No search results
- **Solution**: Check if documents are uploaded and try different keywords

**Problem**: Slow performance
- **Solution**: Wait for processing to complete, avoid uploading too many large documents at once

### Technical Requirements
- **File Format**: PDF documents only
- **File Size**: Reasonable size limits (avoid extremely large files)
- **Content**: Text-based PDFs work best (scanned images may not work well)
- **Internet**: Required for AI model access

---

## 📊 Understanding Results

### Query Responses
- **Answer**: AI-generated response based on your documents
- **Sources**: Specific document chunks used to generate the answer
- **Confidence**: Implied by the specificity and detail of the response
- **Context**: Related information that might be helpful

### Document Statistics
- **Total Documents**: Number of unique documents uploaded
- **Total Chunks**: Number of text segments for searching
- **Metadata**: Information about each document's origin and type

---

## 🌟 Best Practices for University Research

### Research Strategy
1. **Start Broad**: Upload general university information first
2. **Get Specific**: Add detailed program requirements
3. **Compare Options**: Query for comparisons between universities
4. **Verify Information**: Cross-reference with official university websites

### Question Types to Try
- **Admission Requirements**: "What are the minimum GPA requirements for..."
- **Test Scores**: "What IELTS/TOEFL scores are needed for..."
- **Application Deadlines**: "When is the application deadline for..."
- **Program Details**: "What courses are included in the... program at..."
- **Scholarships**: "What scholarship opportunities are available for..."

---

## 🆘 Support & Feedback

If you encounter issues or have suggestions for improvement:

1. **Check Documentation**: Review this help section first
2. **Try Different Approaches**: Rephrase your queries or check document formats
3. **Document Issues**: Note specific error messages or unexpected behavior
4. **Feature Requests**: Consider what additional functionality would be helpful

---

## 🔄 Version Information

**Current Version**: Gradio-based PANSEA Assistant
**AI Models**: SEA-LION optimized for Southeast Asian contexts
**Document Processing**: Advanced semantic chunking and embedding
**Search Technology**: Vector similarity search with contextual ranking

---

*Happy university hunting! 🎓 We hope this tool helps you find the perfect educational opportunity in Southeast Asia.*
""")
|
tabs/initialize.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Initialize tab functionality for the Gradio app
|
| 3 |
+
"""
|
| 4 |
+
import gradio as gr
|
| 5 |
+
from utils.rag_system import DocumentIngestion, RAGSystem
|
| 6 |
+
|
| 7 |
+
def initialize_systems(global_vars):
    """Initialize the document-ingestion and RAG systems into *global_vars*.

    On success, stores instances under the 'doc_ingestion' and 'rag_system'
    keys and returns a success message.  On failure, any partially
    initialized entries are removed — so other tabs do not observe a
    half-ready state — and a human-readable error message is returned.
    """
    try:
        print("🚀 Initializing document ingestion system...")
        global_vars['doc_ingestion'] = DocumentIngestion()
        print("🚀 Initializing RAG system...")
        global_vars['rag_system'] = RAGSystem()
        return "✅ Systems initialized successfully! You can now upload documents."
    except Exception as e:
        # BUG FIX: previously a failure in RAGSystem() left 'doc_ingestion'
        # populated, so other tabs behaved as if initialization succeeded.
        global_vars.pop('doc_ingestion', None)
        global_vars.pop('rag_system', None)

        error_msg = f"❌ Error initializing systems: {str(e)}\n\n"

        # Add install hints for the most common deployment failure.
        if "sentence-transformers" in str(e):
            error_msg += """
**Possible solutions:**
1. Install sentence-transformers: `pip install sentence-transformers`
2. Or provide OpenAI API key in environment variables
3. Check that PyTorch is properly installed

**For deployment:**
- Ensure requirements.txt includes: sentence-transformers, torch, transformers
"""
        return error_msg
|
| 30 |
+
def create_initialize_tab(global_vars):
    """Build the "Initialize System" tab: an action button plus a status box.

    The button invokes ``initialize_systems`` with the shared *global_vars*
    dict and writes its returned message into the status textbox.
    """
    with gr.Tab("🚀 Initialize System", id="init"):
        gr.Markdown("""
### Step 1: Initialize the System
Click the button below to initialize the AI models and embedding systems.
This may take a few moments on first run as models are downloaded.
""")

        initialize_button = gr.Button(
            "🚀 Initialize Systems",
            variant="primary",
            size="lg",
        )

        status_box = gr.Textbox(
            label="Initialization Status",
            interactive=False,
            lines=8,
            placeholder="Click 'Initialize Systems' to start...",
        )

        # Defer the call so the shared state dict is captured at click time.
        initialize_button.click(
            lambda: initialize_systems(global_vars),
            outputs=status_box,
        )
tabs/manage.py
ADDED
|
@@ -0,0 +1,237 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Manage documents tab functionality for the Gradio app
|
| 3 |
+
"""
|
| 4 |
+
import gradio as gr
|
| 5 |
+
|
| 6 |
+
def manage_documents(global_vars):
    """Manage uploaded documents - view, delete individual or all documents.

    Args:
        global_vars: Shared state dict; reads 'doc_ingestion' (helper exposing
            load_existing_vectorstore()).

    Returns:
        tuple[str, str, list[str]]: (markdown summary, markdown document list,
        dropdown options formatted "source (university)"). The third element is
        now ALWAYS a list — previously the early-return paths returned "" while
        the success path returned a list and the exception path returned [],
        so callers feeding it to gr.Dropdown(choices=...) saw mixed types.
    """
    doc_ingestion = global_vars.get('doc_ingestion')

    if not doc_ingestion:
        return "❌ Please initialize systems first!", "", []

    try:
        vectorstore = doc_ingestion.load_existing_vectorstore()

        if not vectorstore:
            return "⚠️ No documents found. Upload documents first.", "", []

        # Pull every stored chunk plus its metadata straight from Chroma.
        collection = vectorstore._collection
        all_docs = collection.get(include=["metadatas", "documents"])
        metadatas = all_docs["metadatas"]
        ids = all_docs["ids"]
        documents = all_docs["documents"]

        # Group chunks by file_id so each uploaded file appears once.
        doc_map = {}
        for meta, doc_id, doc_text in zip(metadatas, ids, documents):
            file_id = meta.get("file_id", doc_id)
            if file_id not in doc_map:
                doc_map[file_id] = {
                    "source": meta.get("source", "Unknown"),
                    "university": meta.get("university", "Unknown"),
                    "country": meta.get("country", "Unknown"),
                    "document_type": meta.get("document_type", "Unknown"),
                    "language": meta.get("language", "Unknown"),
                    "upload_timestamp": meta.get("upload_timestamp", "Unknown"),
                    "file_id": file_id,
                    "chunks": []
                }
            doc_map[file_id]["chunks"].append(doc_text)

        if not doc_map:
            return "ℹ️ No documents found in the system.", "", []

        # Create summary
        total_documents = len(doc_map)
        total_chunks = sum(len(info["chunks"]) for info in doc_map.values())

        summary = f"""## 📊 Document Statistics

**Total Documents:** {total_documents}
**Total Text Chunks:** {total_chunks}
**Storage Status:** Active

## 📚 Document List
"""

        # Create document list with details.
        document_list = ""
        for i, (file_id, info) in enumerate(doc_map.items(), 1):
            # Trim ISO timestamps to "YYYY-MM-DDTHH:MM:SS" for display.
            timestamp = info['upload_timestamp'][:19] if len(info['upload_timestamp']) > 19 else info['upload_timestamp']

            document_list += f"""
**{i}. {info['source']}**
- University: {info['university']}
- Country: {info['country']}
- Type: {info['document_type']}
- Language: {info['language']}
- Chunks: {len(info['chunks'])}
- Uploaded: {timestamp}
- File ID: `{file_id}`

---
"""

        # Dropdown options for individual deletion.
        file_options = [f"{info['source']} ({info['university']})" for info in doc_map.values()]

        return summary, document_list, file_options

    except Exception as e:
        return f"❌ Error loading documents: {str(e)}", "", []
|
| 87 |
+
|
| 88 |
+
def delete_document(selected_file, current_doc_list, global_vars):
    """Remove every chunk belonging to the document shown as *selected_file*.

    *selected_file* is the dropdown label "source (university)". Returns a
    (status message, refreshed document-list markdown) pair; on any failure
    the original list is handed back unchanged.
    """
    doc_ingestion = global_vars.get('doc_ingestion')

    # Nothing to do without an initialized system and a concrete selection.
    if not doc_ingestion or not selected_file:
        return "❌ Please select a document to delete.", current_doc_list

    try:
        vectorstore = doc_ingestion.load_existing_vectorstore()
        if not vectorstore:
            return "❌ No vectorstore found.", current_doc_list

        collection = vectorstore._collection
        stored = collection.get(include=["metadatas"])
        pairs = list(zip(stored["metadatas"], stored["ids"]))

        # Resolve the dropdown label back to the owning file_id.
        target_file_id = next(
            (
                meta.get("file_id", chunk_id)
                for meta, chunk_id in pairs
                if f"{meta.get('source', 'Unknown')} ({meta.get('university', 'Unknown')})" == selected_file
            ),
            None,
        )

        if not target_file_id:
            return "❌ Document not found.", current_doc_list

        # Every chunk sharing the file_id belongs to the same upload.
        doomed = [
            chunk_id
            for meta, chunk_id in pairs
            if meta.get("file_id", chunk_id) == target_file_id
        ]
        collection.delete(ids=doomed)

        # Re-render the document list now that the chunks are gone.
        _, refreshed_list, _ = manage_documents(global_vars)

        return f"✅ Successfully deleted document: {selected_file}", refreshed_list

    except Exception as e:
        return f"❌ Error deleting document: {str(e)}", current_doc_list
|
| 129 |
+
|
| 130 |
+
def delete_all_documents(global_vars):
    """Wipe every chunk from the vectorstore and clear the cached handle.

    Returns a (status message, empty document-list markdown) pair.
    """
    doc_ingestion = global_vars.get('doc_ingestion')

    if not doc_ingestion:
        return "❌ Please initialize systems first.", ""

    try:
        vectorstore_instance = doc_ingestion.load_existing_vectorstore()
        if not vectorstore_instance:
            return "⚠️ No documents found to delete.", ""

        # Collect every chunk id currently stored.
        collection = vectorstore_instance._collection
        all_ids = collection.get()["ids"]

        # Guard clause instead of if/else: empty store means nothing to do.
        if not all_ids:
            return "ℹ️ No documents found to delete.", ""

        collection.delete(ids=all_ids)
        # Drop the shared handle so later queries don't hit a stale store.
        global_vars['vectorstore'] = None
        return f"✅ Successfully deleted all {len(all_ids)} document chunks.", ""

    except Exception as e:
        return f"❌ Error deleting all documents: {str(e)}", ""
|
| 158 |
+
|
| 159 |
+
def create_manage_tab(global_vars):
    """Create the Manage Documents tab.

    Wires the refresh / delete-one / delete-all buttons to the module-level
    management helpers. `global_vars` is the shared app state dict.
    """
    with gr.Tab("🗂 Manage Documents", id="manage"):
        gr.Markdown("""
        ### Step 4: Manage Your Documents
        View, inspect, and manage all uploaded documents in your knowledge base.
        You can see document details and delete individual documents or all documents.
        """)

        # Buttons for actions
        with gr.Row():
            refresh_btn = gr.Button("🔄 Refresh Document List", variant="secondary")
            delete_all_btn = gr.Button("🗑️ Delete All Documents", variant="stop")

        # Document statistics and list
        doc_summary = gr.Markdown(
            value="📊 Click 'Refresh Document List' to view your documents.",
            label="Document Summary"
        )

        doc_list = gr.Markdown(
            value="📚 Document details will appear here after refresh.",
            label="Document List"
        )

        # Individual document deletion
        gr.Markdown("### 🗑️ Delete Individual Document")

        with gr.Row():
            file_selector = gr.Dropdown(
                choices=[],
                label="Select Document to Delete",
                interactive=True,
                info="First click 'Refresh Document List' to see available documents"
            )
            delete_single_btn = gr.Button("🗑️ Delete Selected", variant="stop")

        delete_status = gr.Textbox(
            label="Action Status",
            interactive=False,
            lines=2,
            placeholder="Deletion status will appear here..."
        )

        # Event handlers
        def refresh_documents():
            """Reload stats/list and repopulate the deletion dropdown."""
            summary, documents, file_options = manage_documents(global_vars)
            return summary, documents, gr.Dropdown(choices=file_options, value=None)

        def delete_selected_document(selected_file, current_list):
            """Delete one document, then rebuild the dropdown from the store."""
            if not selected_file:
                # BUGFIX: previously returned gr.Dropdown(choices=[]) here,
                # which wiped the dropdown options on a no-op click. Re-derive
                # the current options instead so the list survives.
                _, _, options = manage_documents(global_vars)
                return "❌ Please select a document to delete first.", current_list, gr.Dropdown(choices=options, value=None)

            status, new_list = delete_document(selected_file, current_list, global_vars)
            # Also refresh the file options after deletion.
            _, _, new_options = manage_documents(global_vars)
            return status, new_list, gr.Dropdown(choices=new_options, value=None)

        def delete_all_docs():
            """Delete everything and reset the list widgets."""
            status, _ = delete_all_documents(global_vars)
            return status, "📚 No documents in the system.", gr.Dropdown(choices=[], value=None)

        # Connect event handlers
        refresh_btn.click(
            refresh_documents,
            outputs=[doc_summary, doc_list, file_selector]
        )

        delete_single_btn.click(
            delete_selected_document,
            inputs=[file_selector, doc_list],
            outputs=[delete_status, doc_list, file_selector]
        )

        delete_all_btn.click(
            delete_all_docs,
            outputs=[delete_status, doc_list, file_selector]
        )
|
tabs/query.py
ADDED
|
@@ -0,0 +1,139 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Query documents tab functionality for the Gradio app
|
| 3 |
+
"""
|
| 4 |
+
import gradio as gr
|
| 5 |
+
|
| 6 |
+
def query_documents(question, language, global_vars):
    """Answer *question* about the indexed documents, replying in *language*.

    Returns a single formatted string: model banner, answer, and up to three
    cited sources — or an error message for any unusable app state.
    """
    rag_system = global_vars.get('rag_system')
    vectorstore = global_vars.get('vectorstore')

    # Guard clauses for every way the app can be in an unusable state.
    if not rag_system:
        return "❌ Please initialize systems first using the 'Initialize System' tab!"
    if not vectorstore:
        return "❌ Please upload and process documents first using the 'Upload Documents' tab!"
    if not question.strip():
        return "❌ Please enter a question."

    try:
        print(f"🔍 Processing query: {question}")
        result = rag_system.query(question, language)

        answer = result["answer"]
        sources = result.get("source_documents", [])
        model_used = result.get("model_used", "SEA-LION")

        # Assemble the reply: model banner, answer, then citations.
        parts = [
            f"**Model Used:** {model_used}\n\n",
            f"**Answer:**\n{answer}\n\n",
        ]

        if sources:
            parts.append("**📚 Sources:**\n")
            for idx, doc in enumerate(sources[:3], 1):
                meta = doc.metadata
                parts.append(f"{idx}. **{meta.get('source', 'Unknown')}**\n")
                parts.append(f" - University: {meta.get('university', 'Unknown')}\n")
                parts.append(f" - Country: {meta.get('country', 'Unknown')}\n")
                parts.append(f" - Type: {meta.get('document_type', 'Unknown')}\n")
                parts.append(f" - Preview: {doc.page_content[:150]}...\n\n")
        else:
            parts.append("\n*No specific sources found. This might be a general response.*")

        return "".join(parts)

    except Exception as e:
        return f"❌ Error querying documents: {str(e)}\n\nPlease check the console for more details."
|
| 54 |
+
|
| 55 |
+
def get_example_questions():
    """Return example questions for the interface.

    These strings seed the clickable example buttons in the Search & Query
    tab (laid out two per column by create_query_tab); clicking one copies
    it into the question textbox.
    """
    return [
        "What are the admission requirements for Computer Science programs in Singapore?",
        "Which universities offer scholarships for international students?",
        "What are the tuition fees for MBA programs in Thailand?",
        "Find universities with engineering programs under $5000 per year",
        "What are the application deadlines for programs in Malaysia?",
        "Compare admission requirements between different ASEAN countries"
    ]
|
| 65 |
+
|
| 66 |
+
def create_query_tab(global_vars):
    """Create the Search & Query tab.

    Builds the question textbox, response-language dropdown, search button,
    answer box, and clickable example-question buttons, then wires the search
    button to query_documents(). `global_vars` is the shared app state dict.
    """
    with gr.Tab("🔍 Search & Query", id="query"):
        gr.Markdown("""
        ### Step 3: Ask Questions
        Ask questions about the uploaded documents in your preferred language.
        The AI will provide detailed answers with source citations.
        """)

        with gr.Row():
            with gr.Column(scale=3):
                question_input = gr.Textbox(
                    label="💭 Your Question",
                    placeholder="Ask anything about the universities...",
                    lines=3
                )

            with gr.Column(scale=1):
                language_dropdown = gr.Dropdown(
                    choices=[
                        "English", "Chinese", "Malay", "Thai",
                        "Indonesian", "Vietnamese", "Filipino"
                    ],
                    value="English",
                    label="🌍 Response Language"
                )

        query_btn = gr.Button(
            "🔍 Search Documents",
            variant="primary",
            size="lg"
        )

        answer_output = gr.Textbox(
            label="🤖 AI Response",
            interactive=False,
            lines=20,
            placeholder="Ask a question to get AI-powered answers..."
        )

        # Example questions section — laid out two per column inside one row.
        gr.Markdown("### 💡 Example Questions")
        example_questions = get_example_questions()

        with gr.Row():
            # NOTE: `i < len(example_questions)` is always true inside this
            # range() loop; only the `i + 1` bound check below matters.
            for i in range(0, len(example_questions), 2):
                with gr.Column():
                    if i < len(example_questions):
                        example_btn = gr.Button(
                            example_questions[i],
                            size="sm",
                            variant="secondary"
                        )
                        # Default-arg binding (x=...) captures the question at
                        # definition time; a bare closure over `i` would
                        # late-bind and every button would emit the last value.
                        example_btn.click(
                            lambda x=example_questions[i]: x,
                            outputs=question_input
                        )

                    if i + 1 < len(example_questions):
                        example_btn2 = gr.Button(
                            example_questions[i + 1],
                            size="sm",
                            variant="secondary"
                        )
                        example_btn2.click(
                            lambda x=example_questions[i + 1]: x,
                            outputs=question_input
                        )

        # Route the search through query_documents with the shared state.
        query_btn.click(
            lambda question, language: query_documents(question, language, global_vars),
            inputs=[question_input, language_dropdown],
            outputs=answer_output
        )
|
tabs/upload.py
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Upload documents tab functionality for the Gradio app
|
| 3 |
+
"""
|
| 4 |
+
import gradio as gr
|
| 5 |
+
|
| 6 |
+
def upload_documents(files, global_vars):
    """Handle document upload and processing.

    Args:
        files: List of file paths from the gr.File component.
        global_vars: Shared state dict; reads 'doc_ingestion', writes
            'vectorstore' on success.

    Returns:
        str: A status/summary message for the UI.
    """
    doc_ingestion = global_vars.get('doc_ingestion')

    if not doc_ingestion:
        return "❌ Please initialize systems first using the 'Initialize System' tab!"

    if not files:
        return "❌ Please upload at least one PDF file."

    try:
        # Keep only PDFs. BUGFIX: the extension check is now case-insensitive
        # so files named e.g. "Brochure.PDF" are accepted (previously only a
        # lowercase ".pdf" suffix passed the filter).
        pdf_files = [path for path in files if path.lower().endswith('.pdf')]

        if not pdf_files:
            return "❌ Please upload PDF files only."

        print(f"📄 Processing {len(pdf_files)} PDF file(s)...")

        # Process documents
        documents = doc_ingestion.process_documents(pdf_files)

        if not documents:
            return "❌ No documents were successfully processed. Please check if your PDFs are readable."

        print("🔗 Creating vector store...")
        vectorstore = doc_ingestion.create_vector_store(documents)

        if not vectorstore:
            return "❌ Failed to create vector store from documents."

        # Make the store available to the query tab.
        global_vars['vectorstore'] = vectorstore

        # Create summary
        summary = f"✅ Successfully processed {len(documents)} document(s):\n\n"

        for i, doc in enumerate(documents, 1):
            metadata = doc.metadata
            # Use .get() for every field for consistency — the original
            # indexed metadata['source'] directly and could raise KeyError.
            summary += f"{i}. **{metadata.get('source', 'Unknown')}**\n"
            summary += f" - University: {metadata.get('university', 'Unknown')}\n"
            summary += f" - Country: {metadata.get('country', 'Unknown')}\n"
            summary += f" - Type: {metadata.get('document_type', 'Unknown')}\n"
            summary += f" - Language: {metadata.get('language', 'Unknown')}\n\n"

        summary += "🎉 **Ready for queries!** Go to the 'Search & Query' tab to start asking questions."
        return summary

    except Exception as e:
        return f"❌ Error processing documents: {str(e)}\n\nPlease check the console for more details."
|
| 65 |
+
|
| 66 |
+
def create_upload_tab(global_vars):
    """Build the Upload Documents tab and hook up the processing button."""
    with gr.Tab("📄 Upload Documents", id="upload"):
        gr.Markdown("""
        ### Step 2: Upload PDF Documents
        Upload university documents (brochures, admission guides, etc.) in PDF format.
        The system will automatically extract metadata including university name, country, and document type.
        """)

        # Multi-file picker restricted to PDFs.
        file_upload = gr.File(
            label="📁 Upload PDF Documents",
            file_types=[".pdf"],
            file_count="multiple",
            height=120
        )

        upload_btn = gr.Button("📄 Process Documents", variant="primary", size="lg")

        upload_status = gr.Textbox(
            label="Processing Status",
            interactive=False,
            lines=12,
            placeholder="Upload PDF files and click 'Process Documents'..."
        )

        # Named handler instead of an inline lambda; same call either way.
        def _handle_upload(files):
            return upload_documents(files, global_vars)

        upload_btn.click(
            _handle_upload,
            inputs=file_upload,
            outputs=upload_status
        )
|
utils/display.py
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
from utils.rag_system import DocumentIngestion, RAGSystem, save_query_result, load_shared_query
|
| 3 |
+
|
| 4 |
+
def display_query_result(result, show_share_link=False):
    """Render one query result with Streamlit: model banner, answer,
    optional share link, and an expander per cited source document."""
    st.markdown('<div class="query-result">', unsafe_allow_html=True)

    # Model banner (only when the RAG layer reported which model answered).
    model_name = result.get("model_used")
    if model_name:
        st.info(f"🤖 **Model Used:** {model_name}")

    st.subheader("🎯 Answer")
    st.write(result["answer"])

    # Optional shareable permalink for this query.
    query_id = result.get("query_id")
    if show_share_link and query_id:
        st.markdown("---")
        current_url = st.get_option("browser.serverAddress") or "localhost:8501"
        share_url = f"http://{current_url}?share={query_id}"
        st.markdown(f"""
<div class="share-link">
<strong>🔗 Share this result:</strong><br>
<code>{share_url}</code>
</div>
""", unsafe_allow_html=True)

        if st.button("📋 Copy Share Link"):
            st.code(share_url)

    # Source documents, one expander each.
    sources = result.get("source_documents")
    if sources:
        st.markdown("---")
        st.subheader("📚 Sources")
        for idx, doc in enumerate(sources, 1):
            meta = doc.metadata
            with st.expander(f"Source {idx}: {meta.get('source', 'Unknown')}"):
                left, right = st.columns([1, 2])
                with left:
                    st.write(f"**University:** {meta.get('university', 'Unknown')}")
                    st.write(f"**Country:** {meta.get('country', 'Unknown')}")
                    st.write(f"**Type:** {meta.get('document_type', 'Unknown')}")
                with right:
                    st.write("**Relevant Content:**")
                    body = doc.page_content
                    # Cap the preview at 300 chars with an ellipsis.
                    preview = body if len(body) <= 300 else body[:300] + "..."
                    st.write(preview)

    st.markdown('</div>', unsafe_allow_html=True)
|
| 47 |
+
|
| 48 |
+
def display_shared_query(query_id):
    """Display a shared query result.

    Looks up a previously saved query by *query_id* (via load_shared_query)
    and renders it read-only — no share link — with a button to start a
    fresh question instead.
    """
    st.header("🔗 Shared Query Result")

    result_data = load_shared_query(query_id)

    if result_data:
        st.info(f"**Original Question:** {result_data['question']}")
        st.write(f"**Language:** {result_data['language']}")
        # Timestamp is sliced to its first 10 chars for display — presumably
        # an ISO string whose date part is "YYYY-MM-DD"; confirm against
        # save_query_result's format.
        st.write(f"**Date:** {result_data['timestamp'][:10]}")

        # Create a mock result object for display: display_query_result expects
        # objects with .metadata / .page_content attributes, so wrap each saved
        # source dict in a throwaway type.
        mock_result = {
            "answer": result_data["answer"],
            "source_documents": [
                type('MockDoc', (), {
                    'metadata': source,
                    'page_content': source.get('content_preview', '')
                })() for source in result_data.get('sources', [])
            ]
        }

        display_query_result(mock_result, show_share_link=False)

        if st.button("🔍 Ask Your Own Question"):
            # Clear the ?share=... query param and reload without it.
            # NOTE(review): st.experimental_set_query_params / experimental_rerun
            # are deprecated and removed in recent Streamlit releases; migrate
            # to st.query_params.clear() / st.rerun() once the pinned Streamlit
            # version is confirmed to support them.
            st.experimental_set_query_params()
            st.experimental_rerun()
    else:
        st.error("❌ Shared query not found or has expired.")
        if st.button("🏠 Go to Home"):
            st.experimental_set_query_params()
            st.experimental_rerun()
|
utils/rag_system.py
ADDED
|
@@ -0,0 +1,615 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import uuid
|
| 3 |
+
import tempfile
|
| 4 |
+
from typing import List, Optional, Dict, Any
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
import PyPDF2
|
| 7 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 8 |
+
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
|
| 9 |
+
from langchain_community.vectorstores import Chroma
|
| 10 |
+
from langchain.chains import RetrievalQA
|
| 11 |
+
from langchain_community.document_loaders import PyPDFLoader
|
| 12 |
+
from langchain.schema import Document
|
| 13 |
+
from dotenv import load_dotenv
|
| 14 |
+
|
| 15 |
+
from datetime import datetime
|
| 16 |
+
import json
|
| 17 |
+
import base64
|
| 18 |
+
from openai import OpenAI
|
| 19 |
+
import re
|
| 20 |
+
from semantic_chunking import SemanticChunker
|
| 21 |
+
|
| 22 |
+
# Load environment variables
|
| 23 |
+
load_dotenv()
|
| 24 |
+
|
| 25 |
+
class AlternativeEmbeddings:
    """Alternative embeddings using Sentence Transformers when OpenAI is not available"""

    def __init__(self):
        # self.model stays None until one of the candidate models loads.
        self.model = None
        self.embedding_size = 384

        try:
            from sentence_transformers import SentenceTransformer
        except ImportError:
            print("❌ sentence-transformers not available. Please install it or provide OpenAI API key.")
            raise ImportError("sentence-transformers not available")

        # Try smaller models in order of preference for better cloud
        # compatibility; stop at the first one that loads.
        candidates = [
            ("all-MiniLM-L6-v2", 384),         # very small and reliable
            ("paraphrase-MiniLM-L3-v2", 384),  # even smaller
            ("BAAI/bge-small-en-v1.5", 384),   # original choice
        ]

        for model_name, embed_size in candidates:
            try:
                print(f"🔄 Trying to load model: {model_name}")
                self.model = SentenceTransformer(model_name)
                self.embedding_size = embed_size
                print(f"✅ Successfully loaded: {model_name}")
                break
            except Exception as e:
                print(f"⚠️ Failed to load {model_name}: {str(e)}")

        if not self.model:
            raise Exception("All embedding models failed to load")

    def embed_documents(self, texts):
        """Encode a batch of texts; returns a list of embedding vectors."""
        if not self.model:
            raise Exception("No embedding model available")
        try:
            return self.model.encode(texts, convert_to_numpy=True).tolist()
        except Exception as e:
            print(f"Error encoding documents: {e}")
            raise

    def embed_query(self, text):
        """Encode a single query string; returns one embedding vector."""
        if not self.model:
            raise Exception("No embedding model available")
        try:
            return self.model.encode([text], convert_to_numpy=True)[0].tolist()
        except Exception as e:
            print(f"Error encoding query: {e}")
            raise
|
| 77 |
+
|
| 78 |
+
class SEALionLLM:
    """Custom LLM class for SEA-LION models.

    Wraps an OpenAI-compatible client pointed at the SEA-LION API and
    routes each query to one of two models:

    - an "instruct" model for simple / translation queries, and
    - a "reasoning" model (called with thinking mode) for complex
      multi-criteria university-search queries.

    Also provides LLM-based metadata extraction for uploaded documents.
    """

    def __init__(self):
        # OpenAI-compatible client; base URL defaults to the public
        # SEA-LION endpoint when SEA_LION_BASE_URL is not set.
        self.client = OpenAI(
            api_key=os.getenv("SEA_LION_API_KEY"),
            base_url=os.getenv("SEA_LION_BASE_URL", "https://api.sea-lion.ai/v1")
        )

        # Model configurations
        self.instruct_model = "aisingapore/Gemma-SEA-LION-v3-9B-IT"
        self.reasoning_model = "aisingapore/Llama-SEA-LION-v3.5-8B-R"

    def _is_complex_query(self, query: str) -> bool:
        """Determine if query requires reasoning model or simple instruct model.

        Heuristic: counts domain keywords (multilingual) plus budget/
        comparison patterns; a score of 2 or more marks the query complex.
        """
        # Keywords that indicate complex university search queries
        complex_keywords = [
            "university", "admission", "requirement", "tuition", "fee", "program", "course",
            "degree", "master", "bachelor", "phd", "scholarship", "deadline", "application",
            "budget", "under", "less than", "below", "compare", "recommend", "suggest",
            "which", "what are the", "show me", "find me", "search for",
            # Chinese keywords
            "大学", "学费", "专业", "硕士", "学士", "博士", "申请", "要求", "奖学金",
            # Malay keywords
            "universiti", "yuran", "program", "ijazah", "syarat", "permohonan",
            # Thai keywords
            "มหาวิทยาลัย", "ค่าเล่าเรียน", "หลักสูตร", "ปริญญา", "เงื่อนไข",
            # Indonesian keywords
            "universitas", "biaya", "kuliah", "program", "sarjana", "persyaratan"
        ]

        # Check for multiple criteria (indicates complex search)
        criteria_count = 0
        query_lower = query.lower()

        for keyword in complex_keywords:
            if keyword.lower() in query_lower:
                criteria_count += 1

        # Also check for comparison words, numbers, conditions
        comparison_patterns = [
            r"under \$?\d+", r"less than \$?\d+", r"below \$?\d+", r"between \$?\d+ and \$?\d+",
            r"不超过.*元", r"低于.*元", r"少于.*元",  # Chinese
            r"kurang dari", r"di bawah",  # Malay/Indonesian
            r"น้อยกว่า", r"ต่ำกว่า"  # Thai
        ]

        # A matched budget/comparison pattern is weighted double.
        for pattern in comparison_patterns:
            if re.search(pattern, query_lower):
                criteria_count += 2

        # Complex query if multiple keywords or comparison patterns found
        return criteria_count >= 2

    def _is_translation_query(self, query: str) -> bool:
        """Check if query is primarily for translation."""
        translation_keywords = [
            "translate", "translation", "แปล", "翻译", "terjemah", "traduire"
        ]

        query_lower = query.lower()
        return any(keyword in query_lower for keyword in translation_keywords)

    def generate_response(self, query: str, context: str = "", language: str = "English") -> str:
        """Generate response using appropriate SEA-LION model.

        Args:
            query: the user question.
            context: optional retrieved document context injected into
                the system prompt.
            language: target response language.

        Returns:
            The model's answer text (reasoning-model "<think>" traces are
            stripped), or an apology string on any API error.
        """

        # Choose model based on query complexity
        if self._is_translation_query(query) or not self._is_complex_query(query):
            model = self.instruct_model
            use_reasoning = False
        else:
            model = self.reasoning_model
            use_reasoning = True

        # Prepare messages
        system_prompt = f"""You are a helpful assistant specializing in ASEAN university admissions.
Respond in {language} unless specifically asked otherwise.

If provided with context from university documents, use that information to give accurate, specific answers.
Always cite your sources when using provided context.

For complex university search queries, provide:
1. Direct answers to the question
2. Relevant admission requirements
3. Tuition fees (if available)
4. Application deadlines (if available)
5. Source citations from the documents

Context: {context}"""

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": query}
        ]

        try:
            if use_reasoning:
                # Use reasoning model with thinking mode
                # NOTE(review): "thinking_mode" is passed via extra_body —
                # assumed to be a SEA-LION-specific parameter; confirm against
                # the SEA-LION API docs.
                response = self.client.chat.completions.create(
                    model=model,
                    messages=messages,
                    max_tokens=2000,
                    temperature=0.1,
                    extra_body={"thinking_mode": True}
                )
            else:
                # Use instruct model for simpler queries
                response = self.client.chat.completions.create(
                    model=model,
                    messages=messages,
                    max_tokens=1500,
                    temperature=0.3
                )

            # Strip out reasoning steps from the response
            response_text = response.choices[0].message.content
            if "</think>" in response_text:
                # Keep only the text after the last closing think tag.
                response_text = response_text.split("</think>")[-1].strip()

            return response_text

        except Exception as e:
            print(f"Error with SEA-LION model: {str(e)}")
            return f"I apologize, but I encountered an error processing your query. Please try rephrasing your question. Error: {str(e)}"

    def extract_metadata(self, document_text: str) -> Dict[str, str]:
        """Extract metadata from document text using LLM.

        Asks the instruct model for a JSON object with university_name,
        country, document_type and language; falls back to line-based text
        parsing, and finally to defaults, when the response is not valid
        JSON.
        """

        system_prompt = """You are an expert at extracting metadata from university documents.
Analyze the provided document text and extract the following information:

1. University name (full official name)
2. Country (where the university is located)
3. Document type (choose from: admission_requirements, tuition_fees, program_information, scholarship_info, application_deadlines, general_info)
4. Language (choose from: English, Chinese, Malay, Thai, Indonesian, Vietnamese, Filipino)

Return your response as a JSON object with these exact keys:
{
    "university_name": "extracted university name or 'Unknown' if not found",
    "country": "extracted country or 'Unknown' if not found",
    "document_type": "most appropriate document type from the list above",
    "language": "detected language of the document"
}

Guidelines:
- For university_name: Look for official university names, avoid abbreviations when possible
- For country: Look for country names, city names that indicate country, or domain extensions
- For document_type: Analyze the content to determine what type of information it contains
- For language: Determine the primary language of the document.
- If information is unclear, use "Unknown" for university_name and country
- Always choose one of the specified document_type options and language options
"""

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": f"Extract metadata from this document text:\n\n{document_text}"}
        ]

        try:
            # Low temperature for deterministic metadata extraction.
            response = self.client.chat.completions.create(
                model=self.instruct_model,
                messages=messages,
                max_tokens=500,
                temperature=0.1
            )

            response_text = response.choices[0].message.content.strip()
            print("--- DEBUG: LLM Metadata Extraction Details ---")
            print(f"**Input Text for LLM (first 2 pages):**\n```\n{document_text[:1000]}...\n```")  # Show first 1000 chars of input
            print(f"**Raw LLM Response:**\n```json\n{response_text}\n```")

            # Non-greedy match is sufficient here because the expected JSON
            # object is flat (no nested braces).
            json_match = re.search(r'\{.*?\}', response_text, re.DOTALL)
            if json_match:
                json_str = json_match.group(0)
                try:
                    metadata = json.loads(json_str)
                    print(f"**Parsed JSON Metadata:**\n```json\n{json.dumps(metadata, indent=2)}\n```")
                    required_keys = ["university_name", "country", "document_type", "language"]
                    if all(key in metadata for key in required_keys):
                        print("DEBUG: Successfully extracted and parsed metadata from LLM.")
                        return metadata
                    else:
                        print("DEBUG: LLM response missing required keys, attempting fallback or using defaults.")
                        return self._get_default_metadata()
                except json.JSONDecodeError as e:
                    print(f"DEBUG: JSON Parsing Failed: {e}")
                    print(f"DEBUG: Attempting fallback text extraction from raw response.")
                    return self._extract_from_text_response(response_text)
            else:
                print("DEBUG: No JSON object found in LLM response.")
                return self._extract_from_text_response(response_text)

        except Exception as e:
            print(f"DEBUG: Error during LLM Metadata Extraction: {str(e)}")
            return self._get_default_metadata()

    def _extract_from_text_response(self, response_text: str) -> Dict[str, str]:
        """Fallback method to extract metadata from non-JSON LLM response.

        Scans the response line by line for "key: value" pairs, stripping
        surrounding quotes/commas; unmatched fields keep their defaults.
        """
        metadata = self._get_default_metadata()
        lines = response_text.split("\n")
        for line in lines:
            line = line.strip()
            if "university" in line.lower() and ":" in line:
                value = line.split(":", 1)[1].strip().strip('",')
                metadata["university_name"] = value
            elif "country" in line.lower() and ":" in line:
                value = line.split(":", 1)[1].strip().strip('",')
                metadata["country"] = value
            elif "document_type" in line.lower() and ":" in line:
                value = line.split(":", 1)[1].strip().strip('",')
                metadata["document_type"] = value
            elif "language" in line.lower() and ":" in line:
                value = line.split(":", 1)[1].strip().strip('",')
                metadata["language"] = value
        print(f"DEBUG: Fallback text extraction result: {metadata}")
        return metadata

    def _get_default_metadata(self) -> Dict[str, str]:
        """Return default metadata when extraction fails."""
        return {
            "university_name": "Unknown",
            "country": "Unknown",
            "document_type": "general_info",
            "language": "Unknown"
        }
|
| 303 |
+
|
| 304 |
+
def classify_query_type(query: str) -> str:
    """Public function to classify query type for UI display.

    Returns "simple" for translation or low-complexity queries (instruct
    model) and "complex" for multi-criteria queries (reasoning model).
    """
    # Create a temporary SEALionLLM instance just for classification
    classifier = SEALionLLM()

    is_simple = (
        classifier._is_translation_query(query)
        or not classifier._is_complex_query(query)
    )
    return "simple" if is_simple else "complex"
|
| 313 |
+
|
| 314 |
+
class DocumentIngestion:
    """Ingests university PDF files into a persistent Chroma vector store.

    Pipeline: per-page text extraction (PyPDF2) -> LLM-based metadata
    detection (SEA-LION) -> Document construction -> semantic chunking ->
    embedding + persistence in Chroma.
    """

    def __init__(self):
        # Initialize SEA-LION LLM for metadata extraction
        self.sea_lion_llm = SEALionLLM()

        # Use BGE embeddings by default for better performance
        try:
            self.embeddings = AlternativeEmbeddings()
            self.embedding_type = "BGE-small-en"
            if not self.embeddings.model:
                # Force the OpenAI fallback below when BGE did not load.
                raise Exception("BGE model not available")
        except Exception:
            # Fallback to OpenAI if BGE not available
            openai_key = os.getenv("OPENAI_API_KEY")
            # Reject known placeholder values so we fail fast with a clear
            # message instead of making doomed API calls.
            if openai_key and openai_key != "placeholder_for_embeddings" and openai_key != "your_openai_api_key_here":
                try:
                    self.embeddings = OpenAIEmbeddings()
                    self.embedding_type = "OpenAI"
                except Exception as e:
                    print("Both BGE and OpenAI embeddings failed. Please check your setup.")
                    raise e
            else:
                print("No embedding model available. Please install sentence-transformers or provide OpenAI API key.")
                raise Exception("No embedding model available")

        self.text_splitter = SemanticChunker(
            embeddings_model=self.embeddings,
            chunk_size=4,  # 4 sentences per base chunk
            overlap=1,  # 1 sentence overlap
            similarity_threshold=0.75,  # Semantic similarity threshold
            min_chunk_size=150,  # Minimum 150 characters
            max_chunk_size=1500,  # Maximum 1500 characters
            debug=True  # Show statistics in Streamlit
        )

        # st.info(f"🧠 Using semantic chunking with {self.embedding_type} embeddings") # Commented out as it's a Streamlit call
        self.persist_directory = os.getenv("CHROMA_PERSIST_DIRECTORY", "./chroma_db")
        os.makedirs(self.persist_directory, exist_ok=True)

    def extract_text_from_pdf(self, pdf_file_path) -> List[str]:
        """Extract text from PDF file path with multiple fallback methods.

        Returns one string per page; an empty list signals failure
        (password-protected, scanned/image-only, or unreadable PDF).
        """
        try:
            # Method 1: Try with PyPDF2 (handles most PDFs including encrypted ones with PyCryptodome)
            with open(pdf_file_path, 'rb') as pdf_file:
                pdf_reader = PyPDF2.PdfReader(pdf_file)

                # Check if PDF is encrypted
                if pdf_reader.is_encrypted:
                    # Try to decrypt with empty password (common for protected but not password-protected PDFs)
                    try:
                        pdf_reader.decrypt("")
                    except Exception:
                        print(f"PDF {os.path.basename(pdf_file_path)} is password-protected. Please provide an unprotected version.")
                        return []  # Return empty list for password-protected PDFs

                text_per_page = []
                for page_num, page in enumerate(pdf_reader.pages):
                    try:
                        page_text = page.extract_text()
                        text_per_page.append(page_text)
                    except Exception as e:
                        print(f"Could not extract text from page {page_num + 1} of {os.path.basename(pdf_file_path)}: {str(e)}")
                        text_per_page.append("")  # Append empty string for failed pages

                # Succeed only if at least one page yielded non-blank text.
                if any(text.strip() for text in text_per_page):
                    return text_per_page
                else:
                    print(f"No extractable text found in {os.path.basename(pdf_file_path)}. This might be a scanned PDF or image-based document.")
                    return []

        except Exception as e:
            # Tailor the hint to the failure mode reported by PyPDF2.
            error_msg = str(e)
            if "PyCryptodome" in error_msg:
                print(f"Encryption error with {os.path.basename(pdf_file_path)}: {error_msg}")
                print("💡 The PDF uses encryption. PyCryptodome has been installed to handle this.")
            elif "password" in error_msg.lower():
                print(f"Password-protected PDF: {os.path.basename(pdf_file_path)}")
                print("💡 Please provide an unprotected version of this PDF.")
            else:
                print(f"Error extracting text from {os.path.basename(pdf_file_path)}: {error_msg}")
            return []

    def process_documents(self, pdf_file_paths) -> List[Document]:
        """Process PDF file paths and convert to documents with automatic metadata extraction.

        NOTE(review): several log messages below print the literal text
        "(unknown)" — this looks like a lost f-string interpolation of
        `filename`; confirm against the original source.
        """
        documents = []
        processed_count = 0
        failed_count = 0

        print(f"📄 Processing {len(pdf_file_paths)} document(s) with automatic metadata detection...")  # Changed to print

        for pdf_file_path in pdf_file_paths:
            if pdf_file_path.endswith('.pdf'):
                filename = os.path.basename(pdf_file_path)
                print(f"🔍 Extracting text from: **(unknown)**")  # Changed to print

                # Extract text per page
                text_per_page = self.extract_text_from_pdf(pdf_file_path)
                print(f"DEBUG: Extracted {len(text_per_page)} pages from (unknown)")

                if text_per_page:
                    # Combine first two pages for metadata extraction
                    text_for_metadata = "\n".join(text_per_page[:2])
                    print(f"DEBUG: Text for metadata extraction (first 500 chars): {text_for_metadata[:500]}")
                    # Extract metadata using LLM
                    print(f"🤖 Detecting metadata for: **(unknown)**")  # Changed to print
                    extracted_metadata = self.sea_lion_llm.extract_metadata(text_for_metadata)

                    # Create metadata
                    metadata = {
                        "source": filename,
                        "university": extracted_metadata.get("university_name", "Unknown"),
                        "country": extracted_metadata.get("country", "Unknown"),
                        "document_type": extracted_metadata.get("document_type", "general_info"),
                        "language": extracted_metadata.get("language", "Unknown"),  # Added language
                        "upload_timestamp": datetime.now().isoformat(),
                        "file_id": str(uuid.uuid4())
                    }

                    # Create document
                    doc = Document(
                        page_content="\n".join(text_per_page),  # Use all pages for document content
                        metadata=metadata
                    )
                    documents.append(doc)
                    processed_count += 1
                    print(f"✅ Successfully processed: **(unknown)** ({len(doc.page_content)} characters)")  # Changed to print
                else:
                    failed_count += 1
                    print(f"⚠️ Could not extract text from **(unknown)**")  # Changed to print
            else:
                failed_count += 1
                filename = os.path.basename(pdf_file_path)
                print(f"❌ Unsupported file type for (unknown) (expected .pdf)")  # Changed to print

        # Summary
        if processed_count > 0:
            print(f"🎉 Successfully processed **{processed_count}** document(s)")  # Changed to print
        if failed_count > 0:
            print(f"⚠️ Failed to process **{failed_count}** document(s)")  # Changed to print

        return documents

    def create_vector_store(self, documents: List[Document]) -> Chroma:
        """Create and persist vector store from documents.

        Splits each document with the semantic chunker, embeds the chunks,
        and writes them to the configured persist directory. Returns None
        when there is nothing to index.
        """
        if not documents:
            print("No documents to process")  # Changed to print
            return None

        # Split documents into chunks
        texts = self.text_splitter.split_documents(documents)

        # Create vector store
        vectorstore = Chroma.from_documents(
            documents=texts,
            embedding=self.embeddings,
            persist_directory=self.persist_directory
        )

        return vectorstore

    def load_existing_vectorstore(self) -> Optional[Chroma]:
        """Load existing vector store if it exists."""
        try:
            vectorstore = Chroma(
                persist_directory=self.persist_directory,
                embedding_function=self.embeddings
            )
            return vectorstore
        except Exception as e:
            print(f"Could not load existing vector store: {str(e)}")  # Changed to print
            return None
|
| 485 |
+
|
| 486 |
+
class RAGSystem:
    """Retrieval-augmented QA over the persisted Chroma store.

    Retrieves the top-k relevant chunks for a question and asks the
    SEA-LION LLM to answer using that context.
    """

    def __init__(self):
        # Initialize embeddings - try BGE first, fallback to OpenAI
        # (the embedding model MUST match the one used at ingestion time,
        # otherwise retrieval quality degrades).
        try:
            self.embeddings = AlternativeEmbeddings()
            if not self.embeddings.model:
                # Fallback to OpenAI if BGE not available
                self.embeddings = OpenAIEmbeddings()
        except Exception:
            # If both fail, use OpenAI as last resort
            self.embeddings = OpenAIEmbeddings()

        self.sea_lion_llm = SEALionLLM()
        self.persist_directory = os.getenv("CHROMA_PERSIST_DIRECTORY", "./chroma_db")

    def get_vectorstore(self) -> Optional[Chroma]:
        """Get the vector store."""
        try:
            vectorstore = Chroma(
                persist_directory=self.persist_directory,
                embedding_function=self.embeddings
            )
            return vectorstore
        except Exception as e:
            print(f"Error loading vector store: {str(e)}")
            return None

    def query(self, question: str, language: str = "English") -> Dict[str, Any]:
        """Query the RAG system using SEA-LION models.

        Args:
            question: the user's question.
            language: desired response language.

        Returns:
            Dict with keys "answer", "source_documents", "query_id" and,
            on success, "original_question", "language" and "model_used".
        """
        vectorstore = self.get_vectorstore()
        # NOTE(review): the empty-store guard below is deliberately
        # commented out; if get_vectorstore() returns None, the
        # AttributeError raised inside the try block is caught and surfaced
        # as an error answer instead.
        # if not vectorstore:
        #     return {
        #         "answer": "No documents have been ingested yet. Please upload some PDF documents first.",
        #         "source_documents": [],
        #         "query_id": None
        #     }

        try:
            # Retrieve relevant documents
            retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
            relevant_docs = retriever.get_relevant_documents(question)

            # Prepare context from retrieved documents
            context_parts = []
            for i, doc in enumerate(relevant_docs, 1):
                source_info = doc.metadata.get('source', 'Unknown')
                university = doc.metadata.get('university', 'Unknown')
                country = doc.metadata.get('country', 'Unknown')

                # Only the first 500 characters of each chunk are sent to
                # the LLM to keep the prompt size bounded.
                context_parts.append(f"""
Document {i} (Source: {source_info}, University: {university}, Country: {country}):
{doc.page_content[:500]}...
""")

            context = "\n".join(context_parts)

            # Generate response using SEA-LION model
            answer = self.sea_lion_llm.generate_response(
                query=question,
                context=context,
                language=language
            )

            # Generate query ID for sharing
            query_id = str(uuid.uuid4())

            return {
                "answer": answer,
                "source_documents": relevant_docs,
                "query_id": query_id,
                "original_question": question,
                "language": language,
                "model_used": "SEA-LION" + (" Reasoning" if self.sea_lion_llm._is_complex_query(question) else " Instruct")
            }

        except Exception as e:
            print(f"Error querying system: {str(e)}")
            return {
                "answer": f"Error processing your question: {str(e)}",
                "source_documents": [],
                "query_id": None
            }
|
| 568 |
+
|
| 569 |
+
def save_query_result(query_result: Dict[str, Any]):
    """Persist a query result as JSON so it can be shared by its ID.

    Writes query_results/<query_id>.json with the question, answer,
    language, timestamp, and lightweight previews of the source documents
    (non-serializable Document objects are flattened).

    Returns:
        True on a successful write, False when the result has no
        "query_id" or the write fails.
    """
    if not query_result.get("query_id"):
        return False

    results_dir = "query_results"
    os.makedirs(results_dir, exist_ok=True)

    result_file = f"{results_dir}/{query_result['query_id']}.json"

    # Flatten each source document into a JSON-safe summary.
    sources = []
    for doc in query_result.get("source_documents", []):
        if len(doc.page_content) > 200:
            preview = doc.page_content[:200] + "..."
        else:
            preview = doc.page_content
        sources.append({
            "source": doc.metadata.get("source", "Unknown"),
            "university": doc.metadata.get("university", "Unknown"),
            "country": doc.metadata.get("country", "Unknown"),
            "content_preview": preview,
        })

    # Prepare data for saving (remove non-serializable objects)
    save_data = {
        "query_id": query_result["query_id"],
        "question": query_result.get("original_question", ""),
        "answer": query_result["answer"],
        "language": query_result.get("language", "English"),
        "timestamp": datetime.now().isoformat(),
        "sources": sources,
    }

    try:
        with open(result_file, 'w', encoding='utf-8') as f:
            json.dump(save_data, f, indent=2, ensure_ascii=False)
        return True
    except Exception as e:
        print(f"Error saving query result: {str(e)}")
        return False
|
| 603 |
+
|
| 604 |
+
def load_shared_query(query_id: str) -> Optional[Dict[str, Any]]:
    """Load a previously saved shared query result by its ID.

    Args:
        query_id: identifier used as the JSON file name under
            query_results/.

    Returns:
        The parsed result dict, or None when the file is missing or
        cannot be read/parsed.
    """
    result_file = f"query_results/{query_id}.json"

    if not os.path.exists(result_file):
        return None

    try:
        with open(result_file, 'r', encoding='utf-8') as f:
            return json.load(f)
    except Exception as e:
        print(f"Error loading shared query: {str(e)}")
    return None
|
utils/translations.py
ADDED
|
@@ -0,0 +1,753 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
translations = {
|
| 2 |
+
"English": {
|
| 3 |
+
# Navigation
|
| 4 |
+
"search_universities": "🔍 Search Universities",
|
| 5 |
+
"upload_documents": "📄 Upload Documents",
|
| 6 |
+
"manage_documents": "🗂 Manage Documents",
|
| 7 |
+
"about": "ℹ️ About Top.Edu",
|
| 8 |
+
"navigation": "🎯 Navigation",
|
| 9 |
+
|
| 10 |
+
# Main header
|
| 11 |
+
"app_title": "🎓 Top.Edu",
|
| 12 |
+
"app_subtitle": "Unlock ASEAN Education with AI-Powered Search",
|
| 13 |
+
|
| 14 |
+
# Search page
|
| 15 |
+
"search_header": "🔍 Search University Information",
|
| 16 |
+
"search_description": "Ask about admissions, fees, scholarships, and programs:",
|
| 17 |
+
"language_label": "Response Language",
|
| 18 |
+
"your_question": "Your question:",
|
| 19 |
+
"placeholder_text": "e.g., Master's in Malaysia under 40,000 RMB/year",
|
| 20 |
+
"example_queries": "💡 See Example Queries",
|
| 21 |
+
"complex_queries": "🧠 Complex Queries (Uses Reasoning Model)",
|
| 22 |
+
"simple_queries": "⚡ Simple Queries (Uses Instruct Model)",
|
| 23 |
+
"advanced_filters": "🔧 Advanced Filters (Optional)",
|
| 24 |
+
"budget_range": "Budget Range (Local Currency/Year)",
|
| 25 |
+
"study_level": "Study Level",
|
| 26 |
+
"preferred_countries": "Preferred Countries",
|
| 27 |
+
"search_button": "🔍 Search",
|
| 28 |
+
"ready_to_search": "✅ Ready to search! Click the search button when you're ready.",
|
| 29 |
+
"enter_question": "💭 Enter your question in the text box above to start searching.",
|
| 30 |
+
"using_example": "📝 Using example:",
|
| 31 |
+
"responses_in": "🌐 Responses will be in",
|
| 32 |
+
|
| 33 |
+
# Upload page
|
| 34 |
+
"upload_header": "📄 Upload University Documents",
|
| 35 |
+
"upload_description": "Upload official PDF documents containing university admission requirements, fees, and program information.",
|
| 36 |
+
"university_name": "University Name",
|
| 37 |
+
"country": "Country",
|
| 38 |
+
"document_type": "Document Type",
|
| 39 |
+
"choose_files": "Choose PDF files",
|
| 40 |
+
"drag_drop": "Drag and drop files here",
|
| 41 |
+
"file_limit": "Limit 200MB per file • PDF",
|
| 42 |
+
"browse_files": "Browse files",
|
| 43 |
+
"process_documents": "🚀 Process Documents",
|
| 44 |
+
"processing_docs": "📄 Processing document(s)...",
|
| 45 |
+
"successfully_processed": "🎉 Successfully processed",
|
| 46 |
+
"failed_to_process": "⚠️ Failed to process",
|
| 47 |
+
"documents": "document(s)",
|
| 48 |
+
"no_docs_processed": "No documents were successfully processed.",
|
| 49 |
+
|
| 50 |
+
# Document types
|
| 51 |
+
"admission_requirements": "Admission Requirements",
|
| 52 |
+
"tuition_fees": "Tuition Fees & Costs",
|
| 53 |
+
"program_information": "Program Information",
|
| 54 |
+
"scholarship_info": "Scholarship Information",
|
| 55 |
+
"application_deadlines": "Application Deadlines",
|
| 56 |
+
"general_info": "General Information",
|
| 57 |
+
|
| 58 |
+
# Manage documents page
|
| 59 |
+
"manage_header": "🗂 Manage Documents",
|
| 60 |
+
"manage_description": "View and manage uploaded university documents in your knowledge base.",
|
| 61 |
+
"total_documents": "Total Documents",
|
| 62 |
+
"total_chunks": "Total Text Chunks",
|
| 63 |
+
"storage_size": "Storage Size",
|
| 64 |
+
"last_updated": "Last Updated",
|
| 65 |
+
"document_list": "📚 Document List",
|
| 66 |
+
"no_documents": "No documents found. Upload some documents first!",
|
| 67 |
+
"delete_all": "🗑️ Delete All Documents",
|
| 68 |
+
"confirm_delete": "⚠️ Are you sure you want to delete ALL documents? This cannot be undone.",
|
| 69 |
+
"yes_delete": "Yes, Delete All",
|
| 70 |
+
"documents_deleted": "All documents have been deleted.",
|
| 71 |
+
|
| 72 |
+
# About page
|
| 73 |
+
"about_header": "About Top.Edu",
|
| 74 |
+
"what_we_do": "🎯 What We Do",
|
| 75 |
+
"what_we_do_description": "Top.Edu helps students worldwide easily find accurate and up-to-date information on universities in Southeast Asia. Our platform aggregates official university documents and uses AI to answer questions about programs, tuition, entry requirements, and application deadlines.",
|
| 76 |
+
"who_we_are": "💡 Who We Are",
|
| 77 |
+
"who_we_are_description": "We are a team of education and AI enthusiasts dedicated to making higher education accessible and transparent. Our mission is to simplify the search process and provide trustworthy guidance to students looking to study abroad.",
|
| 78 |
+
"contact": "📞 Contact & Support",
|
| 79 |
+
"supported_languages": "🌏 Supported Languages",
|
| 80 |
+
|
| 81 |
+
# Countries
|
| 82 |
+
"singapore": "Singapore",
|
| 83 |
+
"malaysia": "Malaysia",
|
| 84 |
+
"thailand": "Thailand",
|
| 85 |
+
"indonesia": "Indonesia",
|
| 86 |
+
"philippines": "Philippines",
|
| 87 |
+
"vietnam": "Vietnam",
|
| 88 |
+
"brunei": "Brunei",
|
| 89 |
+
|
| 90 |
+
# Study levels
|
| 91 |
+
"diploma": "Diploma",
|
| 92 |
+
"bachelor": "Bachelor",
|
| 93 |
+
"master": "Master",
|
| 94 |
+
"phd": "PhD",
|
| 95 |
+
|
| 96 |
+
# Budget options
|
| 97 |
+
"any": "Any",
|
| 98 |
+
"under_10k": "<10k",
|
| 99 |
+
"10k_20k": "10k-20k",
|
| 100 |
+
"20k_30k": "20k-30k",
|
| 101 |
+
"30k_40k": "30k-40k",
|
| 102 |
+
"over_40k": ">40k",
|
| 103 |
+
|
| 104 |
+
# Example queries
|
| 105 |
+
"example_complex_1": "Show me universities in Malaysia for master's degrees with tuition under 40,000 RMB per year",
|
| 106 |
+
"example_complex_2": "Compare engineering programs in Thailand and Singapore under $15,000 per year",
|
| 107 |
+
"example_complex_3": "Find MBA programs in ASEAN with GMAT requirements and scholarships available",
|
| 108 |
+
"example_complex_4": "Universities in Indonesia with English-taught programs and no IELTS requirement",
|
| 109 |
+
"example_simple_1": "What does IELTS stand for?",
|
| 110 |
+
"example_simple_2": "What is the difference between bachelor and master degree?",
|
| 111 |
+
"example_simple_3": "How to apply for student visa?",
|
| 112 |
+
"example_simple_4": "What documents are needed for university application?",
|
| 113 |
+
|
| 114 |
+
# System messages
|
| 115 |
+
"systems_initialized": "✅ Systems initialized successfully!",
|
| 116 |
+
"can_upload_documents": "You can now upload documents.",
|
| 117 |
+
"initialization_error": "Error initializing systems",
|
| 118 |
+
"installation_help": """**Possible solutions:**
|
| 119 |
+
1. Install sentence-transformers: `pip install sentence-transformers`
|
| 120 |
+
2. Or provide OpenAI API key in environment variables
|
| 121 |
+
3. Check that PyTorch is properly installed
|
| 122 |
+
|
| 123 |
+
**For deployment:**
|
| 124 |
+
- Ensure requirements.txt includes: sentence-transformers, torch, transformers""",
|
| 125 |
+
"please_initialize_first": "Please initialize systems first using the 'Initialize System' tab!",
|
| 126 |
+
"please_upload_pdf": "Please upload at least one PDF file.",
|
| 127 |
+
"upload_pdf_only": "Please upload PDF files only.",
|
| 128 |
+
"successfully_processed_docs": "Successfully processed",
|
| 129 |
+
"failed_create_vectorstore": "Failed to create vector store from documents.",
|
| 130 |
+
"no_docs_successfully_processed": "No documents were successfully processed. Please check if your PDFs are readable.",
|
| 131 |
+
"error_processing_docs": "Error processing documents",
|
| 132 |
+
"check_console": "Please check the console for more details.",
|
| 133 |
+
"please_upload_process_first": "Please upload and process documents first using the 'Upload Documents' tab!",
|
| 134 |
+
"please_enter_question": "Please enter a question.",
|
| 135 |
+
"processing_query": "Processing query",
|
| 136 |
+
"model_used": "Model Used",
|
| 137 |
+
"answer": "Answer",
|
| 138 |
+
"sources": "Sources",
|
| 139 |
+
"no_sources_found": "No specific sources found. This might be a general response.",
|
| 140 |
+
"error_querying_docs": "Error querying documents",
|
| 141 |
+
"ready_for_queries": "Ready for queries! Go to the 'Search & Query' tab to start asking questions.",
|
| 142 |
+
|
| 143 |
+
# Interface elements
|
| 144 |
+
"initialize_system": "Initialize System",
|
| 145 |
+
"initialize_systems": "Initialize Systems",
|
| 146 |
+
"initialization_status": "Initialization Status",
|
| 147 |
+
},
|
| 148 |
+
|
| 149 |
+
"中文": {
|
| 150 |
+
# Navigation
|
| 151 |
+
"search_universities": "🔍 搜索大学",
|
| 152 |
+
"upload_documents": "📄 上传文档",
|
| 153 |
+
"manage_documents": "🗂 管理文档",
|
| 154 |
+
"about": "ℹ️ 关于Top.Edu",
|
| 155 |
+
"navigation": "🎯 导航",
|
| 156 |
+
|
| 157 |
+
# Main header
|
| 158 |
+
"app_title": "🎓 Top.Edu",
|
| 159 |
+
"app_subtitle": "用AI驱动的搜索解锁东盟教育机会",
|
| 160 |
+
|
| 161 |
+
# Search page
|
| 162 |
+
"search_header": "🔍 搜索大学信息",
|
| 163 |
+
"search_description": "询问关于入学要求、学费、奖学金和专业项目:",
|
| 164 |
+
"language_label": "回复语言",
|
| 165 |
+
"your_question": "您的问题:",
|
| 166 |
+
"placeholder_text": "例如:马来西亚硕士学位,学费低于4万人民币/年",
|
| 167 |
+
"example_queries": "💡 查看示例问题",
|
| 168 |
+
"complex_queries": "🧠 复杂查询(使用推理模型)",
|
| 169 |
+
"simple_queries": "⚡ 简单查询(使用指令模型)",
|
| 170 |
+
"advanced_filters": "🔧 高级筛选(可选)",
|
| 171 |
+
"budget_range": "预算范围(当地货币/年)",
|
| 172 |
+
"study_level": "学历层次",
|
| 173 |
+
"preferred_countries": "首选国家",
|
| 174 |
+
"search_button": "🔍 搜索",
|
| 175 |
+
"ready_to_search": "✅ 准备搜索!准备好后点击搜索按钮。",
|
| 176 |
+
"enter_question": "💭 在上面的文本框中输入您的问题开始搜索。",
|
| 177 |
+
"using_example": "📝 使用示例:",
|
| 178 |
+
"responses_in": "🌐 回复将使用",
|
| 179 |
+
|
| 180 |
+
# Upload page
|
| 181 |
+
"upload_header": "📄 上传大学文档",
|
| 182 |
+
"upload_description": "上传包含大学入学要求、学费和专业信息的官方PDF文档。",
|
| 183 |
+
"university_name": "大学名称",
|
| 184 |
+
"country": "国家",
|
| 185 |
+
"document_type": "文档类型",
|
| 186 |
+
"choose_files": "选择PDF文件",
|
| 187 |
+
"drag_drop": "将文件拖放到此处",
|
| 188 |
+
"file_limit": "限制每个文件200MB • PDF",
|
| 189 |
+
"browse_files": "浏览文件",
|
| 190 |
+
"process_documents": "🚀 处理文档",
|
| 191 |
+
"processing_docs": "📄 正在处理文档...",
|
| 192 |
+
"successfully_processed": "🎉 成功处理",
|
| 193 |
+
"failed_to_process": "⚠️ 处理失败",
|
| 194 |
+
"documents": "个文档",
|
| 195 |
+
"no_docs_processed": "没有成���处理任何文档。",
|
| 196 |
+
|
| 197 |
+
# Document types
|
| 198 |
+
"admission_requirements": "入学要求",
|
| 199 |
+
"tuition_fees": "学费和费用",
|
| 200 |
+
"program_information": "专业信息",
|
| 201 |
+
"scholarship_info": "奖学金信息",
|
| 202 |
+
"application_deadlines": "申请截止日期",
|
| 203 |
+
"general_info": "一般信息",
|
| 204 |
+
|
| 205 |
+
# Manage documents page
|
| 206 |
+
"manage_header": "🗂 管理文档",
|
| 207 |
+
"manage_description": "查看和管理您知识库中上传的大学文档。",
|
| 208 |
+
"total_documents": "文档总数",
|
| 209 |
+
"total_chunks": "文本块总数",
|
| 210 |
+
"storage_size": "存储大小",
|
| 211 |
+
"last_updated": "最后更新",
|
| 212 |
+
"document_list": "📚 文档列表",
|
| 213 |
+
"no_documents": "未找到文档。请先上传一些文档!",
|
| 214 |
+
"delete_all": "🗑️ 删除所有文档",
|
| 215 |
+
"confirm_delete": "⚠️ 您确定要删除所有文档吗?此操作无法撤消。",
|
| 216 |
+
"yes_delete": "是的,删除全部",
|
| 217 |
+
"documents_deleted": "所有文档已被删除。",
|
| 218 |
+
|
| 219 |
+
# About page
|
| 220 |
+
"about_header": "关于 Top.Edu",
|
| 221 |
+
"what_we_do": "🎯 我们的工作",
|
| 222 |
+
"what_we_do_description": "Top.Edu 帮助全球学生轻松获取东南亚高校的准确且最新的信息。我们的平台整合官方大学文件,并利用 AI 回答有关课程、学费、入学要求和申请截止日期的问题。",
|
| 223 |
+
"who_we_are": "💡 我们是谁",
|
| 224 |
+
"who_we_are_description": "我们是一支热衷教育与 AI 的团队,致力于让高等教育变得更透明可及。我们的使命是简化搜索流程,为希望出国留学的学生提供可靠指导。",
|
| 225 |
+
"contact": "📞 联系与支持",
|
| 226 |
+
"supported_languages": "🌏 支持语言",
|
| 227 |
+
|
| 228 |
+
# Countries
|
| 229 |
+
"singapore": "新加坡",
|
| 230 |
+
"malaysia": "马来西亚",
|
| 231 |
+
"thailand": "泰国",
|
| 232 |
+
"indonesia": "印度尼西亚",
|
| 233 |
+
"philippines": "菲律宾",
|
| 234 |
+
"vietnam": "越南",
|
| 235 |
+
"brunei": "文莱",
|
| 236 |
+
|
| 237 |
+
# Study levels
|
| 238 |
+
"diploma": "文凭",
|
| 239 |
+
"bachelor": "学士",
|
| 240 |
+
"master": "硕士",
|
| 241 |
+
"phd": "博士",
|
| 242 |
+
|
| 243 |
+
# Budget options
|
| 244 |
+
"any": "任意",
|
| 245 |
+
"under_10k": "<1万",
|
| 246 |
+
"10k_20k": "1-2万",
|
| 247 |
+
"20k_30k": "2-3万",
|
| 248 |
+
"30k_40k": "3-4万",
|
| 249 |
+
"over_40k": ">4万",
|
| 250 |
+
|
| 251 |
+
# Example queries
|
| 252 |
+
"example_complex_1": "为我推荐马来西亚学费低于4万人民币/年的硕士学位项目",
|
| 253 |
+
"example_complex_2": "比较泰国和新加坡学费低于1.5万美元/年的工程专业",
|
| 254 |
+
"example_complex_3": "寻找东盟地区有GMAT要求和奖学金的MBA项目",
|
| 255 |
+
"example_complex_4": "印尼有英语授课且无需雅思的大学项目",
|
| 256 |
+
"example_simple_1": "IELTS是什么意思?",
|
| 257 |
+
"example_simple_2": "学士学位和硕士学位有什么区别?",
|
| 258 |
+
"example_simple_3": "如何申请学生签证?",
|
| 259 |
+
"example_simple_4": "大学申请需要哪些文件?",
|
| 260 |
+
|
| 261 |
+
# System messages
|
| 262 |
+
"systems_initialized": "✅ 系统初始化成功!",
|
| 263 |
+
"can_upload_documents": "您现在可以上传文档。",
|
| 264 |
+
"initialization_error": "系统初始化错误",
|
| 265 |
+
"installation_help": """**可能的解决方案:**
|
| 266 |
+
1. 安装 sentence-transformers: `pip install sentence-transformers`
|
| 267 |
+
2. 或在环境变量中提供 OpenAI API 密钥
|
| 268 |
+
3. 检查 PyTorch 是否正确安装
|
| 269 |
+
|
| 270 |
+
**部署时:**
|
| 271 |
+
- 确保 requirements.txt 包含:sentence-transformers, torch, transformers""",
|
| 272 |
+
"please_initialize_first": "请先使用'初始化系统'选项卡初始化系统!",
|
| 273 |
+
"please_upload_pdf": "请至少上传一个PDF文件。",
|
| 274 |
+
"upload_pdf_only": "请仅上传PDF文件。",
|
| 275 |
+
"successfully_processed_docs": "成功处理",
|
| 276 |
+
"failed_create_vectorstore": "创建向量存储失败。",
|
| 277 |
+
"no_docs_successfully_processed": "没有成功处理任何文档。请检查您的PDF是否可读。",
|
| 278 |
+
"error_processing_docs": "处理文档时出错",
|
| 279 |
+
"check_console": "请查看控制台获取更多详细信息。",
|
| 280 |
+
"please_upload_process_first": "请先使用'上传文档'选项卡上传和处理文档!",
|
| 281 |
+
"please_enter_question": "请输入问题。",
|
| 282 |
+
"processing_query": "正在处理查询",
|
| 283 |
+
"model_used": "使用的模型",
|
| 284 |
+
"answer": "答案",
|
| 285 |
+
"sources": "来源",
|
| 286 |
+
"no_sources_found": "未找到特定来源。这可能是一般性回答。",
|
| 287 |
+
"error_querying_docs": "查询文档时出错",
|
| 288 |
+
"ready_for_queries": "准备查询!前往'搜索与查询'选项卡开始提问。",
|
| 289 |
+
|
| 290 |
+
# Interface elements
|
| 291 |
+
"initialize_system": "初始化系统",
|
| 292 |
+
"initialize_systems": "初始化系统",
|
| 293 |
+
"initialization_status": "初始化状态",
|
| 294 |
+
},
|
| 295 |
+
|
| 296 |
+
"Malay": {
|
| 297 |
+
# Navigation
|
| 298 |
+
"search_universities": "🔍 Cari Universiti",
|
| 299 |
+
"upload_documents": "📄 Muat Naik Dokumen",
|
| 300 |
+
"manage_documents": "🗂 Urus Dokumen",
|
| 301 |
+
"about": "ℹ️ Mengenai Top.Edu",
|
| 302 |
+
"navigation": "🎯 Navigasi",
|
| 303 |
+
|
| 304 |
+
# Main header
|
| 305 |
+
"app_title": "🎓 Top.Edu",
|
| 306 |
+
"app_subtitle": "Buka Pendidikan ASEAN dengan Carian Berkuasa AI",
|
| 307 |
+
|
| 308 |
+
# Search page
|
| 309 |
+
"search_header": "🔍 Cari Maklumat Universiti",
|
| 310 |
+
"search_description": "Tanya tentang kemasukan, yuran, biasiswa, dan program:",
|
| 311 |
+
"language_label": "Bahasa Respons",
|
| 312 |
+
"your_question": "Soalan anda:",
|
| 313 |
+
"placeholder_text": "cth: Ijazah Sarjana di Malaysia di bawah 40,000 RMB/tahun",
|
| 314 |
+
"example_queries": "💡 Lihat Contoh Soalan",
|
| 315 |
+
"complex_queries": "🧠 Soalan Kompleks (Menggunakan Model Penaakulan)",
|
| 316 |
+
"simple_queries": "⚡ Soalan Mudah (Menggunakan Model Arahan)",
|
| 317 |
+
"advanced_filters": "🔧 Penapis Lanjutan (Pilihan)",
|
| 318 |
+
"budget_range": "Julat bajet (mata wang tempatan/tahun)",
|
| 319 |
+
"study_level": "Tahap Pengajian",
|
| 320 |
+
"preferred_countries": "Negara Pilihan",
|
| 321 |
+
"search_button": "🔍 Cari",
|
| 322 |
+
"ready_to_search": "✅ Sedia untuk cari! Klik butang cari bila anda sedia.",
|
| 323 |
+
"enter_question": "💭 Masukkan soalan anda di kotak teks di atas untuk mula mencari.",
|
| 324 |
+
"using_example": "📝 Menggunakan contoh:",
|
| 325 |
+
"responses_in": "🌐 Respons akan dalam bahasa",
|
| 326 |
+
|
| 327 |
+
# Upload page
|
| 328 |
+
"upload_header": "📄 Muat Naik Dokumen Universiti",
|
| 329 |
+
"upload_description": "Muat naik dokumen PDF rasmi yang mengandungi keperluan kemasukan universiti, yuran, dan maklumat program.",
|
| 330 |
+
"university_name": "Nama Universiti",
|
| 331 |
+
"country": "Negara",
|
| 332 |
+
"document_type": "Jenis Dokumen",
|
| 333 |
+
"choose_files": "Pilih fail PDF",
|
| 334 |
+
"drag_drop": "Seret dan lepas fail di sini",
|
| 335 |
+
"file_limit": "Had 200MB setiap fail • PDF",
|
| 336 |
+
"browse_files": "Layari fail",
|
| 337 |
+
"process_documents": "🚀 Proses Dokumen",
|
| 338 |
+
"processing_docs": "📄 Memproses dokumen...",
|
| 339 |
+
"successfully_processed": "🎉 Berjaya diproses",
|
| 340 |
+
"failed_to_process": "⚠️ Gagal diproses",
|
| 341 |
+
"documents": "dokumen",
|
| 342 |
+
"no_docs_processed": "Tiada dokumen yang berjaya diproses.",
|
| 343 |
+
|
| 344 |
+
# Document types
|
| 345 |
+
"admission_requirements": "Keperluan Kemasukan",
|
| 346 |
+
"tuition_fees": "Yuran Pengajian & Kos",
|
| 347 |
+
"program_information": "Maklumat Program",
|
| 348 |
+
"scholarship_info": "Maklumat Biasiswa",
|
| 349 |
+
"application_deadlines": "Tarikh Tutup Permohonan",
|
| 350 |
+
"general_info": "Maklumat Umum",
|
| 351 |
+
|
| 352 |
+
# Manage documents page
|
| 353 |
+
"manage_header": "🗂 Urus Dokumen",
|
| 354 |
+
"manage_description": "Lihat dan urus dokumen universiti yang dimuat naik dalam pangkalan pengetahuan anda.",
|
| 355 |
+
"total_documents": "Jumlah Dokumen",
|
| 356 |
+
"total_chunks": "Jumlah Bahagian Teks",
|
| 357 |
+
"storage_size": "Saiz Storan",
|
| 358 |
+
"last_updated": "Kemaskini Terakhir",
|
| 359 |
+
"document_list": "📚 Senarai Dokumen",
|
| 360 |
+
"no_documents": "Tiada dokumen dijumpai. Muat naik beberapa dokumen dahulu!",
|
| 361 |
+
"delete_all": "🗑️ Padam Semua Dokumen",
|
| 362 |
+
"confirm_delete": "⚠️ Adakah anda pasti mahu memadam SEMUA dokumen? Tindakan ini tidak boleh dibatalkan.",
|
| 363 |
+
"yes_delete": "Ya, Padam Semua",
|
| 364 |
+
"documents_deleted": "Semua dokumen telah dipadam.",
|
| 365 |
+
|
| 366 |
+
# About page
|
| 367 |
+
"about_header": "Tentang Top.Edu",
|
| 368 |
+
"what_we_do": "🎯 Apa Yang Kami Lakukan",
|
| 369 |
+
"what_we_do_description": "Top.Edu membantu pelajar di seluruh dunia untuk mencari maklumat tepat dan terkini mengenai universiti di Asia Tenggara dengan mudah. Platform kami menggabungkan dokumen rasmi universiti dan menggunakan AI untuk menjawab soalan tentang program, yuran pengajian, syarat kemasukan, dan tarikh akhir permohonan.",
|
| 370 |
+
"who_we_are": "💡 Siapa Kami",
|
| 371 |
+
"who_we_are_description": "Kami adalah pasukan yang berminat dalam pendidikan dan AI, berdedikasi untuk menjadikan pendidikan tinggi lebih mudah diakses dan telus. Misi kami adalah mempermudah proses carian dan memberikan panduan yang boleh dipercayai kepada pelajar yang ingin belajar di luar negara.",
|
| 372 |
+
"contact": "📞 Hubungi & Sokongan",
|
| 373 |
+
"supported_languages": "🌏 Bahasa Disokong",
|
| 374 |
+
|
| 375 |
+
# Countries
|
| 376 |
+
"singapore": "Singapura",
|
| 377 |
+
"malaysia": "Malaysia",
|
| 378 |
+
"thailand": "Thailand",
|
| 379 |
+
"indonesia": "Indonesia",
|
| 380 |
+
"philippines": "Filipina",
|
| 381 |
+
"vietnam": "Vietnam",
|
| 382 |
+
"brunei": "Brunei",
|
| 383 |
+
|
| 384 |
+
# Study levels
|
| 385 |
+
"diploma": "Diploma",
|
| 386 |
+
"bachelor": "Sarjana Muda",
|
| 387 |
+
"master": "Sarjana",
|
| 388 |
+
"phd": "PhD",
|
| 389 |
+
|
| 390 |
+
# Budget options
|
| 391 |
+
"any": "Mana-mana",
|
| 392 |
+
"under_10k": "<10k",
|
| 393 |
+
"10k_20k": "10k-20k",
|
| 394 |
+
"20k_30k": "20k-30k",
|
| 395 |
+
"30k_40k": "30k-40k",
|
| 396 |
+
"over_40k": ">40k",
|
| 397 |
+
|
| 398 |
+
# Example queries
|
| 399 |
+
"example_complex_1": "Tunjukkan saya universiti di Malaysia untuk ijazah sarjana dengan yuran di bawah 40,000 RMB setahun",
|
| 400 |
+
"example_complex_2": "Bandingkan program kejuruteraan di Thailand dan Singapura di bawah $15,000 setahun",
|
| 401 |
+
"example_complex_3": "Cari program MBA di ASEAN dengan keperluan GMAT dan biasiswa tersedia",
|
| 402 |
+
"example_complex_4": "Universiti di Indonesia dengan program bahasa Inggeris dan tanpa keperluan IELTS",
|
| 403 |
+
"example_simple_1": "Apakah maksud IELTS?",
|
| 404 |
+
"example_simple_2": "Apakah perbezaan antara ijazah sarjana muda dan sarjana?",
|
| 405 |
+
"example_simple_3": "Bagaimana untuk memohon visa pelajar?",
|
| 406 |
+
"example_simple_4": "Dokumen apakah yang diperlukan untuk permohonan universiti?",
|
| 407 |
+
},
|
| 408 |
+
|
| 409 |
+
"ไทย": {
|
| 410 |
+
# Navigation
|
| 411 |
+
"search_universities": "🔍 ค้นหามหาวิทยาลัย",
|
| 412 |
+
"upload_documents": "📄 อัพโหลดเอกสาร",
|
| 413 |
+
"manage_documents": "🗂 จัดการเอกสาร",
|
| 414 |
+
"about": "ℹ️ เกี่ยวกับ Top.Edu",
|
| 415 |
+
"navigation": "🎯 เมนูหลัก",
|
| 416 |
+
|
| 417 |
+
# Main header
|
| 418 |
+
"app_title": "🎓 Top.Edu",
|
| 419 |
+
"app_subtitle": "ปลดล็อกการศึกษาอาเซียนด้วยการค้นหาที่ขับเคลื่อนด้วย AI",
|
| 420 |
+
|
| 421 |
+
# Search page
|
| 422 |
+
"search_header": "🔍 ค้นหาข้อมูลมหาวิทยาลัย",
|
| 423 |
+
"search_description": "ถามเกี่ยวกับการเข้าเรียน ค่าใช้จ่าย ทุนการศึกษา และหลักสูตร:",
|
| 424 |
+
"language_label": "ภาษาการตอบ",
|
| 425 |
+
"your_question": "คำถามของคุณ:",
|
| 426 |
+
"placeholder_text": "เช่น ปริญญาโทในมาเลเซียต่ำกว่า 40,000 หยวนจีน/ปี",
|
| 427 |
+
"example_queries": "💡 ดูตัวอย่างคำถาม",
|
| 428 |
+
"complex_queries": "🧠 คำถามซับซ้อน (ใช้โมเดลการให้เหตุผล)",
|
| 429 |
+
"simple_queries": "⚡ คำถามง่าย (ใช้โมเดลคำสั่ง)",
|
| 430 |
+
"advanced_filters": "🔧 ตัวกรองขั้นสูง (ตัวเลือก)",
|
| 431 |
+
"budget_range": "ช่วงงบประมาณ (สกุลเงินท้องถิ่น/ปี)",
|
| 432 |
+
"study_level": "ระดับการศึกษา",
|
| 433 |
+
"preferred_countries": "ประเทศที่ต้องการ",
|
| 434 |
+
"search_button": "🔍 ค้นหา",
|
| 435 |
+
"ready_to_search": "✅ พร้อมค้นหา! คลิกปุ่มค้นหาเมื่อคุณพร้อม",
|
| 436 |
+
"enter_question": "💭 ใส่คำถามในกล่องข้อความด้านบนเพื่อเริ่มค้นหา",
|
| 437 |
+
"using_example": "📝 ใช้ตัวอย่าง:",
|
| 438 |
+
"responses_in": "🌐 จะตอบเป็นภาษา",
|
| 439 |
+
|
| 440 |
+
# Upload page
|
| 441 |
+
"upload_header": "📄 อัพโหลดเอกสารมหาวิทยาลัย",
|
| 442 |
+
"upload_description": "อัพโหลดเอกสาร PDF ที่มีข้อกำหนดการรับเข้า ค่าธรรมเนียม และข้อมูลหลักสูตร",
|
| 443 |
+
"university_name": "ชื่อมหาวิทยาลัย",
|
| 444 |
+
"country": "ประเทศ",
|
| 445 |
+
"document_type": "ประเภทเอกสาร",
|
| 446 |
+
"choose_files": "เลือกไฟล์ PDF",
|
| 447 |
+
"file_limit": "จำกัด 200MB ต่อไฟล์ • PDF",
|
| 448 |
+
"process_documents": "🚀 ประมวลผลเอกสาร",
|
| 449 |
+
"processing_docs": "📄 กำลังประมวลผลเอกสาร...",
|
| 450 |
+
"successfully_processed": "🎉 ประมวลผลสำเร็จ",
|
| 451 |
+
"failed_to_process": "⚠️ ประมวลผลล้มเหลว",
|
| 452 |
+
"documents": "เอกสาร",
|
| 453 |
+
"no_docs_processed": "ไม่สามารถประมวลผลเอกสารใดได้สำเร็จ",
|
| 454 |
+
|
| 455 |
+
# Document types
|
| 456 |
+
"admission_requirements": "ข้อกำหนดการรับเข้า",
|
| 457 |
+
"tuition_fees": "ค่าธรรมเนียมการศึกษา",
|
| 458 |
+
"program_information": "ข้อมูลหลักสูตร",
|
| 459 |
+
"scholarship_info": "ข้อมูลทุนการศึกษา",
|
| 460 |
+
"application_deadlines": "กำหนดส่งใบสมัคร",
|
| 461 |
+
"general_info": "ข้อมูลทั่วไป",
|
| 462 |
+
|
| 463 |
+
# Manage documents page
|
| 464 |
+
"manage_header": "🗂 จัดกา��เอกสาร",
|
| 465 |
+
"manage_description": "ดูและจัดการเอกสารมหาวิทยาลัยในฐานข้อมูลของคุณ",
|
| 466 |
+
"total_documents": "เอกสารทั้งหมด",
|
| 467 |
+
"total_chunks": "ส่วนข้อความทั้งหมด",
|
| 468 |
+
"storage_size": "ขนาดที่เก็บข้อมูล",
|
| 469 |
+
"last_updated": "อัปเดตล่าสุด",
|
| 470 |
+
"document_list": "📚 รายการเอกสาร",
|
| 471 |
+
"no_documents": "ไม่พบเอกสาร อัปโหลดเอกสารก่อน!",
|
| 472 |
+
"delete_all": "🗑️ ลบเอกสารทั้งหมด",
|
| 473 |
+
"documents_deleted": "เอกสารทั้งหมดถูกลบแล้ว",
|
| 474 |
+
|
| 475 |
+
# About page
|
| 476 |
+
"about_header": "เกี่ยวกับ Top.Edu",
|
| 477 |
+
"what_we_do": "🎯 สิ่งที่เราทำ",
|
| 478 |
+
"what_we_do_description": "Top.Edu ช่วยให้นักเรียนทั่วโลกสามารถค้นหาข้อมูลที่ถูกต้องและทันสมัยเกี่ยวกับมหาวิทยาลัยในเอเชียตะวันออกเฉียงใต้ได้อย่างง่ายดาย แพลตฟอร์มของเรารวบรวมเอกสารทางการของมหาวิทยาลัยและใช้ AI เพื่อตอบคำถามเกี่ยวกับหลักสูตร ค่าเล่าเรียน ข้อกำหนดการเข้าเรียน และกำหนดส่งใบสมัคร",
|
| 479 |
+
"who_we_are": "💡 เราคือใคร",
|
| 480 |
+
"who_we_are_description": "เราคือทีมผู้ที่ชื่นชอบการศึกษาและ AI มุ่งมั่นที่จะทำให้การศึกษาระดับอุดมศึกษาสามารถเข้าถึงได้และโปร่งใส ภารกิจของเราคือทำให้กระบวนการค้นหาง่ายขึ้นและให้คำแนะนำที่เชื่อถือได้แก่นักเรียนที่ต้องการศึกษาต่อต่างประเทศ",
|
| 481 |
+
"contact": "📞 ติดต่อ & สนับสนุน",
|
| 482 |
+
"supported_languages": "🌏 ภาษาที่รองรับ",
|
| 483 |
+
|
| 484 |
+
# Countries
|
| 485 |
+
"singapore": "สิงคโปร์",
|
| 486 |
+
"malaysia": "มาเลเซีย",
|
| 487 |
+
"thailand": "ไทย",
|
| 488 |
+
"indonesia": "อินโดนีเซีย",
|
| 489 |
+
"philippines": "ฟิลิปปินส์",
|
| 490 |
+
"vietnam": "เวียดนาม",
|
| 491 |
+
"brunei": "บรูไน",
|
| 492 |
+
|
| 493 |
+
# Study levels
|
| 494 |
+
"diploma": "ประกาศนียบัตร",
|
| 495 |
+
"bachelor": "ปริญญาตรี",
|
| 496 |
+
"master": "ปริญญาโท",
|
| 497 |
+
"phd": "ปริญญาเอก",
|
| 498 |
+
|
| 499 |
+
# Budget options
|
| 500 |
+
"any": "ใดก็ได้",
|
| 501 |
+
"under_10k": "<10k",
|
| 502 |
+
"10k_20k": "10k-20k",
|
| 503 |
+
"20k_30k": "20k-30k",
|
| 504 |
+
"30k_40k": "30k-40k",
|
| 505 |
+
"over_40k": ">40k",
|
| 506 |
+
|
| 507 |
+
# Example queries
|
| 508 |
+
"example_complex_1": "แสดงมหาวิทยาลัยในมาเลเซียสำหรับปริญญาโทที่มีค่าเล่าเรียนต่ำกว่า 40,000 หยวนต่อปี",
|
| 509 |
+
"example_complex_2": "เปรียบเทียบหลักสูตรวิศวกรรมในไทยและสิงคโปร์ต่ำกว่า $15,000 ต่อปี",
|
| 510 |
+
"example_complex_3": "ค้นหาหลักสูตร MBA ในอาเซียนที่มีข้อกำหนด GMAT และทุนการศึกษา",
|
| 511 |
+
"example_complex_4": "มหาวิทยาลัยในอินโดนีเซียที่มีหลักสูตรสอนภาษาอังกฤษและไม่ต้องการ IELTS",
|
| 512 |
+
"example_simple_1": "IELTS ย่อมาจากอะไร?",
|
| 513 |
+
"example_simple_2": "ความแตกต่างระหว่างปริญญาตรีและปริญญาโทคืออะไร?",
|
| 514 |
+
"example_simple_3": "วิธีสมัครวีซ่านักเรียนอย่างไร?",
|
| 515 |
+
"example_simple_4": "เอกสารอะไรบ้างที่จำเป็นสำหรับการสมัครเข้ามหาวิทยาลัย?",
|
| 516 |
+
},
|
| 517 |
+
|
| 518 |
+
"Indonesian": {
|
| 519 |
+
# Navigation
|
| 520 |
+
"search_universities": "�� Cari Universitas",
|
| 521 |
+
"upload_documents": "📄 Unggah Dokumen",
|
| 522 |
+
"manage_documents": "🗂 Kelola Dokumen",
|
| 523 |
+
"about": "ℹ️ Tentang Top.Edu",
|
| 524 |
+
"navigation": "🎯 Navigasi",
|
| 525 |
+
|
| 526 |
+
# Main header
|
| 527 |
+
"app_title": "🎓 Top.Edu",
|
| 528 |
+
"app_subtitle": "Buka Pendidikan ASEAN dengan Pencarian Bertenaga AI",
|
| 529 |
+
|
| 530 |
+
# Search page
|
| 531 |
+
"search_header": "🔍 Cari Informasi Universitas",
|
| 532 |
+
"search_description": "Tanya tentang penerimaan, biaya, beasiswa, dan program:",
|
| 533 |
+
"language_label": "Bahasa Respon",
|
| 534 |
+
"your_question": "Pertanyaan Anda:",
|
| 535 |
+
"placeholder_text": "mis: Magister di Malaysia di bawah 40.000 RMB/tahun",
|
| 536 |
+
"example_queries": "💡 Lihat Contoh Pertanyaan",
|
| 537 |
+
"complex_queries": "🧠 Pertanyaan Kompleks (Menggunakan Model Penalaran)",
|
| 538 |
+
"simple_queries": "⚡ Pertanyaan Sederhana (Menggunakan Model Instruksi)",
|
| 539 |
+
"advanced_filters": "🔧 Filter Lanjutan (Opsional)",
|
| 540 |
+
"budget_range": "Rentang anggaran (mata uang lokal/tahun)",
|
| 541 |
+
"study_level": "Tingkat Studi",
|
| 542 |
+
"preferred_countries": "Negara Pilihan",
|
| 543 |
+
"search_button": "🔍 Cari",
|
| 544 |
+
"ready_to_search": "✅ Siap mencari! Klik tombol cari saat Anda siap.",
|
| 545 |
+
"enter_question": "💭 Masukkan pertanyaan di kotak teks di atas untuk mulai mencari.",
|
| 546 |
+
"using_example": "📝 Menggunakan contoh:",
|
| 547 |
+
"responses_in": "🌐 Respon akan dalam bahasa",
|
| 548 |
+
|
| 549 |
+
# Upload page
|
| 550 |
+
"upload_header": "📄 Unggah Dokumen Universitas",
|
| 551 |
+
"upload_description": "Unggah dokumen PDF berisi persyaratan masuk universitas, biaya, dan informasi program.",
|
| 552 |
+
"university_name": "Nama Universitas",
|
| 553 |
+
"country": "Negara",
|
| 554 |
+
"document_type": "Jenis Dokumen",
|
| 555 |
+
"choose_files": "Pilih file PDF",
|
| 556 |
+
"file_limit": "Batas 200MB per file • PDF",
|
| 557 |
+
"process_documents": "🚀 Proses Dokumen",
|
| 558 |
+
"processing_docs": "📄 Memproses dokumen...",
|
| 559 |
+
"successfully_processed": "🎉 Berhasil diproses",
|
| 560 |
+
"failed_to_process": "⚠️ Gagal diproses",
|
| 561 |
+
"documents": "dokumen",
|
| 562 |
+
"no_docs_processed": "Tidak ada dokumen yang berhasil diproses.",
|
| 563 |
+
|
| 564 |
+
# Document types
|
| 565 |
+
"admission_requirements": "Persyaratan Masuk",
|
| 566 |
+
"tuition_fees": "Biaya Kuliah & Biaya",
|
| 567 |
+
"program_information": "Informasi Program",
|
| 568 |
+
"scholarship_info": "Informasi Beasiswa",
|
| 569 |
+
"application_deadlines": "Batas Waktu Aplikasi",
|
| 570 |
+
"general_info": "Informasi Umum",
|
| 571 |
+
|
| 572 |
+
# Manage documents page
|
| 573 |
+
"manage_header": "🗂 Kelola Dokumen",
|
| 574 |
+
"manage_description": "Lihat dan kelola dokumen universitas dalam basis pengetahuan Anda.",
|
| 575 |
+
"total_documents": "Total Dokumen",
|
| 576 |
+
"total_chunks": "Total Bagian Teks",
|
| 577 |
+
"storage_size": "Ukuran Penyimpanan",
|
| 578 |
+
"last_updated": "Terakhir Diperbarui",
|
| 579 |
+
"document_list": "📚 Daftar Dokumen",
|
| 580 |
+
"no_documents": "Tidak ada dokumen ditemukan. Unggah beberapa dokumen terlebih dahulu!",
|
| 581 |
+
"delete_all": "🗑️ Hapus Semua Dokumen",
|
| 582 |
+
"documents_deleted": "Semua dokumen telah dihapus.",
|
| 583 |
+
|
| 584 |
+
# About page
|
| 585 |
+
"about_header": "Tentang Top.Edu",
|
| 586 |
+
"what_we_do": "🎯 Apa Yang Kami Lakukan",
|
| 587 |
+
"what_we_do_description": "Top.Edu membantu mahasiswa di seluruh dunia dengan mudah menemukan informasi yang akurat dan terbaru tentang universitas di Asia Tenggara. Platform kami mengumpulkan dokumen resmi universitas dan menggunakan AI untuk menjawab pertanyaan tentang program, biaya kuliah, persyaratan masuk, dan tenggat waktu pendaftaran.",
|
| 588 |
+
"who_we_are": "💡 Siapa Kami",
|
| 589 |
+
"who_we_are_description": "Kami adalah tim yang bersemangat di bidang pendidikan dan AI, berdedikasi untuk membuat pendidikan tinggi lebih mudah diakses dan transparan. Misi kami adalah menyederhanakan proses pencarian dan memberikan panduan terpercaya bagi mahasiswa yang ingin belajar di luar negeri.",
|
| 590 |
+
"contact": "📞 Kontak & Dukungan",
|
| 591 |
+
"supported_languages": "🌏 Bahasa Didukung",
|
| 592 |
+
|
| 593 |
+
# Countries
|
| 594 |
+
"singapore": "Singapura",
|
| 595 |
+
"malaysia": "Malaysia",
|
| 596 |
+
"thailand": "Thailand",
|
| 597 |
+
"indonesia": "Indonesia",
|
| 598 |
+
"philippines": "Filipina",
|
| 599 |
+
"vietnam": "Vietnam",
|
| 600 |
+
"brunei": "Brunei",
|
| 601 |
+
|
| 602 |
+
# Study levels
|
| 603 |
+
"diploma": "Diploma",
|
| 604 |
+
"bachelor": "Sarjana",
|
| 605 |
+
"master": "Magister",
|
| 606 |
+
"phd": "Doktor",
|
| 607 |
+
|
| 608 |
+
# Budget options
|
| 609 |
+
"any": "Apa saja",
|
| 610 |
+
"under_10k": "<10k",
|
| 611 |
+
"10k_20k": "10k-20k",
|
| 612 |
+
"20k_30k": "20k-30k",
|
| 613 |
+
"30k_40k": "30k-40k",
|
| 614 |
+
"over_40k": ">40k",
|
| 615 |
+
|
| 616 |
+
# Example queries
|
| 617 |
+
"example_complex_1": "Tunjukkan universitas di Malaysia untuk gelar magister dengan biaya kuliah di bawah 40.000 RMB per tahun",
|
| 618 |
+
"example_complex_2": "Bandingkan program teknik di Thailand dan Singapura di bawah $15.000 per tahun",
|
| 619 |
+
"example_complex_3": "Cari program MBA di ASEAN dengan persyaratan GMAT dan beasiswa tersedia",
|
| 620 |
+
"example_complex_4": "Universitas di Indonesia dengan program berbahasa Inggris tanpa persyaratan IELTS",
|
| 621 |
+
"example_simple_1": "Apa kepanjangan dari IELTS?",
|
| 622 |
+
"example_simple_2": "Apa perbedaan antara gelar sarjana dan magister?",
|
| 623 |
+
"example_simple_3": "Bagaimana cara mengajukan visa pelajar?",
|
| 624 |
+
"example_simple_4": "Dokumen apa saja yang diperlukan untuk aplikasi universitas?",
|
| 625 |
+
},
|
| 626 |
+
|
| 627 |
+
"Tiếng Việt": {
|
| 628 |
+
# Navigation
|
| 629 |
+
"search_universities": "🔍 Tìm kiếm Trường đại học",
|
| 630 |
+
"upload_documents": "📄 Tải lên Tài liệu",
|
| 631 |
+
"manage_documents": "🗂 Quản lý Tài liệu",
|
| 632 |
+
"about": "ℹ️ Về Top.Edu",
|
| 633 |
+
"navigation": "🎯 Điều hướng",
|
| 634 |
+
|
| 635 |
+
# Main header
|
| 636 |
+
"app_title": "🎓 Top.Edu",
|
| 637 |
+
"app_subtitle": "Mở khóa Giáo dục ASEAN với Tìm kiếm được hỗ trợ bởi AI",
|
| 638 |
+
|
| 639 |
+
# Search page
|
| 640 |
+
"search_header": "🔍 Tìm kiếm Thông tin Trường đại học",
|
| 641 |
+
"search_description": "Hỏi về tuyển sinh, học phí, học bổng và chương trình:",
|
| 642 |
+
"language_label": "Ngôn ngữ Phản hồi",
|
| 643 |
+
"your_question": "Câu hỏi của bạn:",
|
| 644 |
+
"placeholder_text": "vd: Thạc sĩ tại Malaysia dưới 40.000 tệ/năm",
|
| 645 |
+
"example_queries": "💡 Xem Câu hỏi Mẫu",
|
| 646 |
+
"complex_queries": "🧠 Câu hỏi Phức tạp (Sử dụng Mô hình Lý luận)",
|
| 647 |
+
"simple_queries": "⚡ Câu hỏi Đơn giản (Sử dụng Mô hình Hướng dẫn)",
|
| 648 |
+
"advanced_filters": "🔧 Bộ lọc Nâng cao (Tùy chọn)",
|
| 649 |
+
"budget_range": "Khoảng ngân sách (tiền tệ địa phương/năm)",
|
| 650 |
+
"study_level": "Bậc học",
|
| 651 |
+
"preferred_countries": "Quốc gia Ưa thích",
|
| 652 |
+
"search_button": "🔍 Tìm kiếm",
|
| 653 |
+
"ready_to_search": "✅ Sẵn sàng tìm kiếm! Nhấp vào nút tìm kiếm khi bạn sẵn sàng.",
|
| 654 |
+
"enter_question": "💭 Nhập câu hỏi vào hộp văn bản ở trên để bắt đầu tìm kiếm.",
|
| 655 |
+
"using_example": "📝 Sử dụng ví dụ:",
|
| 656 |
+
"responses_in": "🌐 Phản hồi sẽ bằng tiếng",
|
| 657 |
+
|
| 658 |
+
# Upload page
|
| 659 |
+
"upload_header": "📄 Tải lên Tài liệu Đại học",
|
| 660 |
+
"upload_description": "Tải lên tài liệu PDF chứa yêu cầu tuyển sinh, học phí và thông tin chương trình của đại học.",
|
| 661 |
+
"university_name": "Tên Đại học",
|
| 662 |
+
"country": "Quốc gia",
|
| 663 |
+
"document_type": "Loại Tài liệu",
|
| 664 |
+
"choose_files": "Chọn file PDF",
|
| 665 |
+
"file_limit": "Giới hạn 200MB mỗi file • PDF",
|
| 666 |
+
"process_documents": "🚀 Xử lý Tài liệu",
|
| 667 |
+
"processing_docs": "📄 Đang xử lý tài liệu...",
|
| 668 |
+
"successfully_processed": "🎉 Xử lý thành công",
|
| 669 |
+
"failed_to_process": "⚠️ Xử lý thất bại",
|
| 670 |
+
"documents": "tài liệu",
|
| 671 |
+
"no_docs_processed": "Không có tài liệu nào được xử lý thành công.",
|
| 672 |
+
|
| 673 |
+
# Document types
|
| 674 |
+
"admission_requirements": "Yêu cầu Tuyển sinh",
|
| 675 |
+
"tuition_fees": "Học phí & Chi phí",
|
| 676 |
+
"program_information": "Thông tin Chương trình",
|
| 677 |
+
"scholarship_info": "Thông tin Học bổng",
|
| 678 |
+
"application_deadlines": "Hạn nộp Đơn",
|
| 679 |
+
"general_info": "Thông tin Chung",
|
| 680 |
+
|
| 681 |
+
# Manage documents page
|
| 682 |
+
"manage_header": "🗂 Quản lý Tài liệu",
|
| 683 |
+
"manage_description": "Xem và quản lý tài liệu đại học trong cơ sở tri thức của bạn.",
|
| 684 |
+
"total_documents": "Tổng Tài liệu",
|
| 685 |
+
"total_chunks": "Tổng Đoạn Văn bản",
|
| 686 |
+
"storage_size": "Kích thước Lưu trữ",
|
| 687 |
+
"last_updated": "Cập nhật Cuối",
|
| 688 |
+
"document_list": "📚 Danh sách Tài liệu",
|
| 689 |
+
"no_documents": "Không tìm thấy tài liệu. Hãy tải lên một số tài liệu trước!",
|
| 690 |
+
"delete_all": "🗑️ Xóa Tất cả Tài liệu",
|
| 691 |
+
"documents_deleted": "Tất cả tài liệu đã được xóa.",
|
| 692 |
+
|
| 693 |
+
# About page
|
| 694 |
+
"about_header": "Về Top.Edu",
|
| 695 |
+
"what_we_do": "🎯 Chúng tôi làm gì",
|
| 696 |
+
"what_we_do_description": "Top.Edu giúp sinh viên toàn cầu dễ dàng tìm thông tin chính xác và cập nhật về các trường đại học ở Đông Nam Á. Nền tảng của chúng tôi tổng hợp các tài liệu chính thức của trường và sử dụng AI để trả lời các câu hỏi về chương trình học, học phí, yêu cầu đầu vào và hạn nộp hồ sơ.",
|
| 697 |
+
"who_we_are": "💡 Chúng tôi là ai",
|
| 698 |
+
"who_we_are_description": "Chúng tôi là nhóm những người đam mê giáo dục và AI, cam kết làm cho giáo dục đại học trở nên dễ tiếp cận và minh bạch. Sứ mệnh của chúng tôi là đơn giản hóa quá trình tìm kiếm và cung cấp hướng dẫn đáng tin cậy cho sinh viên muốn du học.",
|
| 699 |
+
"contact": "📞 Liên hệ & Hỗ trợ",
|
| 700 |
+
"supported_languages": "🌏 Ngôn ngữ được hỗ trợ",
|
| 701 |
+
# Countries
|
| 702 |
+
"singapore": "Singapore",
|
| 703 |
+
"malaysia": "Malaysia",
|
| 704 |
+
"thailand": "Thái Lan",
|
| 705 |
+
"indonesia": "Indonesia",
|
| 706 |
+
"philippines": "Philippines",
|
| 707 |
+
"vietnam": "Việt Nam",
|
| 708 |
+
"brunei": "Brunei",
|
| 709 |
+
|
| 710 |
+
# Study levels
|
| 711 |
+
"diploma": "Chứng chỉ",
|
| 712 |
+
"bachelor": "Cử nhân",
|
| 713 |
+
"master": "Thạc sĩ",
|
| 714 |
+
"phd": "Tiến sĩ",
|
| 715 |
+
|
| 716 |
+
# Budget options
|
| 717 |
+
"any": "Bất kỳ",
|
| 718 |
+
"under_10k": "<10k",
|
| 719 |
+
"10k_20k": "10k-20k",
|
| 720 |
+
"20k_30k": "20k-30k",
|
| 721 |
+
"30k_40k": "30k-40k",
|
| 722 |
+
"over_40k": ">40k",
|
| 723 |
+
|
| 724 |
+
# Example queries
|
| 725 |
+
"example_complex_1": "Cho tôi xem các trường đại học ở Malaysia cho bằng thạc sĩ với học phí dưới 40.000 tệ/năm",
|
| 726 |
+
"example_complex_2": "So sánh các chương trình kỹ thuật ở Thái Lan và Singapore dưới $15.000/năm",
|
| 727 |
+
"example_complex_3": "Tìm các chương trình MBA ở ASEAN có yêu cầu GMAT và học bổng",
|
| 728 |
+
"example_complex_4": "Các trường đại học ở Indonesia với chương trình giảng dạy bằng tiếng Anh không yêu cầu IELTS",
|
| 729 |
+
"example_simple_1": "IELTS là viết tắt của gì?",
|
| 730 |
+
"example_simple_2": "Sự khác biệt giữa bằng cử nhân và thạc sĩ là gì?",
|
| 731 |
+
"example_simple_3": "Làm thế nào để xin visa du học?",
|
| 732 |
+
"example_simple_4": "Những tài liệu gì cần thiết cho đơn xin học đại học?",
|
| 733 |
+
}
|
| 734 |
+
}
|
| 735 |
+
|
| 736 |
+
def get_text(key: str, lang: str = "English") -> str:
    """Return the UI string for *key* in language *lang*.

    Lookup order:
    1. The requested language's table, if both the language and the key exist.
    2. The English table as a fallback.
    3. The key itself, so the interface never renders an empty label.
    """
    lang_table = translations.get(lang)
    if lang_table is not None and key in lang_table:
        return lang_table[key]
    # Missing language or untranslated key: fall back to English (or the raw key).
    return translations["English"].get(key, key)
|
| 742 |
+
|
| 743 |
+
# Maps the language names shown in the UI selector to the keys used in the
# `translations` dictionary. Unknown display names resolve to English.
_DISPLAY_TO_CODE = {
    "English": "English",
    "中文 (Chinese)": "中文",
    "Bahasa Malaysia": "Malay",
    "ไทย (Thai)": "ไทย",
    "Bahasa Indonesia": "Indonesian",
    "Tiếng Việt (Vietnamese)": "Tiếng Việt",
}


def get_language_code(display_name: str) -> str:
    """Translate a selector display name into its translation-table key.

    Returns "English" for any display name that is not recognized.
    """
    return _DISPLAY_TO_CODE.get(display_name, "English")
|