Upload 16 files
Browse files- .streamlit/config.toml +9 -0
- README.md +134 -10
- apt.txt +3 -0
- docker/Dockerfile +60 -0
- requirements.txt +6 -0
- scripts/list_models.py +76 -0
- scripts/test_huggingface.py +116 -0
- scripts/troubleshoot.py +263 -0
- src/__init__.py +4 -0
- src/app.py +439 -0
- src/chat_engine.py +228 -0
- src/config.py +106 -0
- src/models.py +10 -0
- src/pdf_export.py +192 -0
- src/utils.py +209 -0
- tests/test_chat_engine.py +10 -0
.streamlit/config.toml
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[theme]
|
| 2 |
+
primaryColor = "#FF4B4B"
|
| 3 |
+
backgroundColor = "#FFFFFF"
|
| 4 |
+
secondaryBackgroundColor = "#F0F2F6"
|
| 5 |
+
textColor = "#262730"
|
| 6 |
+
font = "sans serif"
|
| 7 |
+
|
| 8 |
+
[server]
|
| 9 |
+
runOnSave = true
|
README.md
CHANGED
|
@@ -1,10 +1,134 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 🎓 FINESE SCHOOL: AI Assistant for Data Professionals
|
| 2 |
+
|
| 3 |
+
Ask questions on Python, SQL, Power BI, ML, and more — get **accurate, topic-specific answers** with **code examples**.
|
| 4 |
+
|
| 5 |
+
✅ Powered by **Gemini**
|
| 6 |
+
✅ Download chat as PDF
|
| 7 |
+
✅ Strict topic enforcement
|
| 8 |
+
✅ Built with **Streamlit** & **LangChain**
|
| 9 |
+
|
| 10 |
+
## 🚀 Features
|
| 11 |
+
|
| 12 |
+
- **Expert-level responses** from "Dr. Data", our AI mentor with PhDs in CS and Statistics
|
| 13 |
+
- **Topic-specific knowledge** with strict enforcement to keep answers relevant
|
| 14 |
+
- **Code-rich explanations** with runnable examples
|
| 15 |
+
- **Best practices** and common pitfalls highlighted
|
| 16 |
+
- **PDF export** of entire sessions for offline reference
|
| 17 |
+
- **Beautiful UI** with dark/light mode support
|
| 18 |
+
|
| 19 |
+
## 🔧 Quick Setup
|
| 20 |
+
|
| 21 |
+
1. Copy `.env.example` to `.env`:
|
| 22 |
+
```bash
|
| 23 |
+
cp .env.example .env
|
| 24 |
+
```
|
| 25 |
+
Or on Windows:
|
| 26 |
+
```powershell
|
| 27 |
+
copy .env.example .env
|
| 28 |
+
```
|
| 29 |
+
|
| 30 |
+
2. Edit `.env` and add your API key:
|
| 31 |
+
|
| 32 |
+
For Google Gemini (default):
|
| 33 |
+
- Get your API key from [Google AI Studio](https://aistudio.google.com/)
|
| 34 |
+
- Set `GOOGLE_API_KEY=your_google_api_key_here`
|
| 35 |
+
- Optionally set `MODEL_NAME` to a specific model (defaults to "gemini-1.5-flash")
|
| 36 |
+
|
| 37 |
+
For Hugging Face (recommended for Hugging Face deployment):
|
| 38 |
+
- Get your API key from [Hugging Face](https://huggingface.co/settings/tokens)
|
| 39 |
+
- Set `HUGGINGFACE_API_KEY=your_huggingface_api_key_here`
|
| 40 |
+
- Set `API_TYPE=huggingface`
|
| 41 |
+
- Optionally set `MODEL_NAME` to a specific model (defaults to "mistralai/Mistral-7B-Instruct-v0.2")
|
| 42 |
+
|
| 43 |
+
For OpenAI (alternative):
|
| 44 |
+
- Get your API key from [OpenAI](https://platform.openai.com/api-keys)
|
| 45 |
+
- Set `OPENAI_API_KEY=your_openai_api_key_here`
|
| 46 |
+
- Set `API_TYPE=openai`
|
| 47 |
+
- Optionally set `MODEL_NAME` to a specific model (defaults to "gpt-3.5-turbo")
|
| 48 |
+
|
| 49 |
+
3. Install dependencies:
|
| 50 |
+
```bash
|
| 51 |
+
pip install -r requirements.txt
|
| 52 |
+
```
|
| 53 |
+
|
| 54 |
+
4. Run the app:
|
| 55 |
+
```bash
|
| 56 |
+
streamlit run src/app.py
|
| 57 |
+
```
|
| 58 |
+
|
| 59 |
+
## 📌 Environment Variables
|
| 60 |
+
|
| 61 |
+
- `API_TYPE` (optional): The API provider to use. Options are "google", "huggingface", or "openai" (defaults to "huggingface")
|
| 62 |
+
- `GOOGLE_API_KEY` (required for Google): Your Google AI API key
|
| 63 |
+
- `HUGGINGFACE_API_KEY` (required for Hugging Face): Your Hugging Face API key
|
| 64 |
+
- `OPENAI_API_KEY` (required for OpenAI): Your OpenAI API key
|
| 65 |
+
- `MODEL_NAME` (optional): The model to use (defaults to provider-specific models)
|
| 66 |
+
- `TEMPERATURE` (optional): Model temperature (defaults to 0.3)
|
| 67 |
+
- `MAX_TOKENS` (optional): Maximum tokens in response (defaults to 2048)
|
| 68 |
+
- `IS_DOCKER` (optional): Set to "true" when running in Docker
|
| 69 |
+
|
| 70 |
+
## 🎯 Available Topics
|
| 71 |
+
|
| 72 |
+
1. **Python** - Core Python concepts, data structures, functions, decorators
|
| 73 |
+
2. **Data Analysis with Pandas & NumPy** - Data wrangling, vectorization, time series
|
| 74 |
+
3. **SQL** - ANSI SQL with focus on PostgreSQL/SQLite
|
| 75 |
+
4. **Power BI** - DAX formulas, data modeling, performance tuning
|
| 76 |
+
5. **Machine Learning** - Scikit-learn, model evaluation, feature engineering
|
| 77 |
+
6. **Deep Learning** - Neural networks with TensorFlow/PyTorch
|
| 78 |
+
7. **Data Visualization** - Effective static & interactive plots
|
| 79 |
+
|
| 80 |
+
## 🐳 Docker Support
|
| 81 |
+
|
| 82 |
+
To run in Docker:
|
| 83 |
+
|
| 84 |
+
```bash
|
| 85 |
+
docker build -t finesse-school .
|
| 86 |
+
docker run -p 8501:8501 -e HUGGINGFACE_API_KEY=your_key_here -e API_TYPE=huggingface finesse-school
|
| 87 |
+
```
|
| 88 |
+
|
| 89 |
+
## 🛠️ Troubleshooting
|
| 90 |
+
|
| 91 |
+
### Common issue: "model not found" (404)
|
| 92 |
+
|
| 93 |
+
If you see an error like:
|
| 94 |
+
|
| 95 |
+
```
|
| 96 |
+
Tutor error: 404 models/gemini-1.5-flash is not found for API version v1beta, or is not supported for generateContent.
|
| 97 |
+
```
|
| 98 |
+
|
| 99 |
+
Steps to resolve:
|
| 100 |
+
|
| 101 |
+
1. First, list available models for your API key:
|
| 102 |
+
```powershell
|
| 103 |
+
python .\scripts\list_models.py
|
| 104 |
+
```
|
| 105 |
+
|
| 106 |
+
2. Set `MODEL_NAME` to one of the available models from the list:
|
| 107 |
+
```powershell
|
| 108 |
+
$env:MODEL_NAME="gemini-1.0-pro" # Example - use an available model from the list
|
| 109 |
+
```
|
| 110 |
+
|
| 111 |
+
### Hugging Face API Key Issues
|
| 112 |
+
|
| 113 |
+
If you're getting an error like "You must provide an api_key", make sure:
|
| 114 |
+
|
| 115 |
+
1. You have set the `HUGGINGFACE_API_KEY` environment variable in your `.env` file
|
| 116 |
+
2. Your API key is valid and has "Read" permissions
|
| 117 |
+
3. You have set `API_TYPE=huggingface` in your `.env` file
|
| 118 |
+
|
| 119 |
+
Example of a correct `.env` file for Hugging Face:
|
| 120 |
+
```
|
| 121 |
+
API_TYPE=huggingface
|
| 122 |
+
HUGGINGFACE_API_KEY=hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
|
| 123 |
+
MODEL_NAME=mistralai/Mistral-7B-Instruct-v0.2
|
| 124 |
+
```
|
| 125 |
+
|
| 126 |
+
### Other Issues
|
| 127 |
+
|
| 128 |
+
Run the troubleshooting script to diagnose common problems:
|
| 129 |
+
|
| 130 |
+
```bash
|
| 131 |
+
python .\scripts\troubleshoot.py
|
| 132 |
+
```
|
| 133 |
+
|
| 134 |
+
If you're still stuck, open an issue and include the output of `python scripts/list_models.py` and your `MODEL_NAME` value (do not include your API key).
|
apt.txt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
wkhtmltopdf
|
| 2 |
+
xvfb
|
| 3 |
+
libfontconfig
|
docker/Dockerfile
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Multi-stage build for optimization: Python dependencies are installed once
# in a throwaway builder stage; only the resulting site-packages (plus the
# console scripts) are copied into the slim runtime image.
FROM python:3.11-slim AS builder

WORKDIR /app

# Install Python dependencies in the builder stage.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Final stage
FROM python:3.11-slim

WORKDIR /app

# Install runtime system packages (wkhtmltopdf, xvfb, fonts) directly in the
# final stage. NOTE: the COPY instruction supports no shell redirection or
# `||` fallbacks, so the previous per-file "COPY ... 2>/dev/null || echo"
# approach was invalid Dockerfile syntax and would fail the build. Installing
# via apt here is valid and also pulls in the libraries/fonts wkhtmltopdf
# needs. curl is added because the HEALTHCHECK below uses it and the slim
# base image does not ship it.
COPY apt.txt .
RUN apt-get update && \
    xargs -a apt.txt apt-get install -y --no-install-recommends && \
    apt-get install -y --no-install-recommends curl && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Create non-root user
RUN groupadd -r appuser && useradd -r -g appuser appuser

# Copy Python dependencies from the builder stage. /usr/local/bin is copied
# too so console scripts such as `streamlit` (used by the ENTRYPOINT) exist.
COPY --from=builder /usr/local/lib/python3.11/site-packages /usr/local/lib/python3.11/site-packages
COPY --from=builder /usr/local/bin /usr/local/bin

# Copy application code
COPY . .

# Create .env from the example when neither was provided at build time;
# guarded so the build does not fail if .env.example is absent.
RUN if [ ! -f .env ] && [ -f .env.example ]; then cp .env.example .env; fi

# Change ownership to non-root user
RUN chown -R appuser:appuser /app

# Switch to non-root user
USER appuser

# Expose port
EXPOSE 8501

# Health check against Streamlit's built-in health endpoint
HEALTHCHECK --interval=30s --timeout=10s --start-period=30s --retries=3 \
    CMD curl --fail http://localhost:8501/_stcore/health || exit 1

# Run app
ENTRYPOINT ["streamlit", "run", "src/app.py", "--server.port=8501", "--server.address=0.0.0.0"]
|
requirements.txt
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
streamlit
|
| 2 |
+
langchain-google-genai
|
| 3 |
+
langchain-huggingface
|
| 4 |
+
langchain-openai
|
| 5 |
+
pdfkit
|
| 6 |
+
python-dotenv
|
scripts/list_models.py
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""List available models for the configured GOOGLE_API_KEY.
|
| 3 |
+
|
| 4 |
+
Run:
|
| 5 |
+
$env:GOOGLE_API_KEY="your_key_here"
|
| 6 |
+
python .\scripts\list_models.py
|
| 7 |
+
|
| 8 |
+
This prints a sample of models you can set as MODEL_NAME.
|
| 9 |
+
"""
|
| 10 |
+
import os
|
| 11 |
+
import sys
|
| 12 |
+
import json
|
| 13 |
+
|
| 14 |
+
try:
|
| 15 |
+
from langchain_google_genai import GoogleGenerativeAI
|
| 16 |
+
except Exception as e:
|
| 17 |
+
print("Missing dependency: langchain_google_genai (or import failed):", e)
|
| 18 |
+
sys.exit(2)
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def main():
    """List the Generative AI models visible to GOOGLE_API_KEY and print them.

    Reads GOOGLE_API_KEY from the environment, fetches the model catalog, and
    prints each model name (with any leading "models/" prefix stripped) plus a
    short list of recommended models. Exits early with usage hints when the
    key is missing or the listing call fails.
    """
    key = os.getenv("GOOGLE_API_KEY")
    if not key:
        print("ERROR: Set the GOOGLE_API_KEY environment variable before running this script.")
        print("In PowerShell: $env:GOOGLE_API_KEY=\"your_key_here\"")
        print("In Bash/Mac/Linux: export GOOGLE_API_KEY=\"your_key_here\"")
        return

    print("🔍 Connecting to Google Generative AI...")
    # NOTE(review): recent langchain-google-genai releases require a `model`
    # argument to construct GoogleGenerativeAI and may not expose a
    # `list_models()` method on the wrapper at all — confirm against the
    # installed version; the google.generativeai SDK's module-level
    # list_models() may be what is actually needed here.
    client = GoogleGenerativeAI(google_api_key=key)

    try:
        print("🔄 Fetching available models...")
        models = client.list_models()
    except Exception as e:
        # Broad catch is deliberate: this is a diagnostic CLI and any failure
        # mode should end with readable tips rather than a traceback.
        print("❌ Failed to list models:", e)
        print("\n💡 Troubleshooting tips:")
        print("   1. Check that your API key is valid and properly set")
        print("   2. Ensure you have internet connectivity")
        print("   3. Check if there are any firewall restrictions")
        return

    print("\n✅ Available models:")
    print("=" * 60)

    # models may be a list of strings or dict-like objects; print readable representation
    try:
        model_list = []
        for i, m in enumerate(models):
            if hasattr(m, 'name'):
                # It's likely a Model object; drop the "models/" namespace so
                # the printed value can be used directly as MODEL_NAME.
                model_name = m.name.replace('models/', '') if m.name.startswith('models/') else m.name
                model_list.append(model_name)
                print(f"{i+1:2d}. {model_name}")
            else:
                # It's likely a string or dict
                model_str = str(m)
                model_name = model_str.replace('models/', '') if model_str.startswith('models/') else model_str
                model_list.append(model_name)
                print(f"{i+1:2d}. {model_name}")

        print(f"\n📊 Total models found: {len(model_list)}")
    except TypeError:
        # Non-iterable response: dump it raw. model_list stays whatever was
        # collected before iteration failed (possibly empty), which the
        # recommendation filter below tolerates.
        print(models)

    print("\n📝 To use a specific model, set the MODEL_NAME environment variable:")
    print("PowerShell: $env:MODEL_NAME=\"model_name_from_list\"")
    print("Bash/Linux: export MODEL_NAME=\"model_name_from_list\"")

    print("\n⭐ Recommended models:")
    # Surface the model families most likely to work with this app first.
    recommended = [m for m in model_list if any(r in m.lower() for r in ['gemini-1.5', 'gemini-pro', 'gemma'])]
    for model in recommended[:5]:
        print(f"  • {model}")

if __name__ == "__main__":
    main()
|
scripts/test_huggingface.py
ADDED
|
@@ -0,0 +1,116 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Test script to verify Hugging Face API setup.
|
| 4 |
+
|
| 5 |
+
Run this script to check if your Hugging Face API key is working correctly:
|
| 6 |
+
python .\scripts\test_huggingface.py
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import os
|
| 10 |
+
import sys
|
| 11 |
+
from dotenv import load_dotenv
|
| 12 |
+
|
| 13 |
+
# Load environment variables
|
| 14 |
+
load_dotenv()
|
| 15 |
+
|
| 16 |
+
def test_huggingface_api():
    """Test Hugging Face API connectivity.

    Verifies the API key and library are available, then tries the configured
    model followed by a few known fallbacks, stopping at the first model that
    answers a short test prompt.

    Returns:
        bool: True when any model responds successfully, False otherwise.
    """
    print("🔍 Testing Hugging Face API connectivity...")

    # Check if API key is set
    api_key = os.getenv("HUGGINGFACE_API_KEY")
    if not api_key:
        print("❌ HUGGINGFACE_API_KEY environment variable is not set")
        print("   Please set your Hugging Face API key in the .env file")
        return False

    # Only the length is printed so the secret itself never leaks into logs.
    print(f"✅ HUGGINGFACE_API_KEY is set (length: {len(api_key)} characters)")

    # Try to import the required library
    try:
        from langchain_huggingface import HuggingFaceEndpoint
        print("✅ langchain_huggingface library is available")
    except ImportError as e:
        print(f"❌ Failed to import langchain_huggingface: {e}")
        print("   Install it with: pip install langchain-huggingface")
        return False

    # Map model names to their appropriate inference task types.
    TASK_MAPPING = {
        "microsoft/DialoGPT-large": "conversational",
        "HuggingFaceH4/zephyr-7b-beta": "conversational",
        "google/flan-t5-xxl": "text2text-generation",
        "google/flan-t5-xl": "text2text-generation",
        "google/flan-ul2": "text2text-generation",
        "bigscience/bloom": "text-generation",
        "gpt2": "text-generation",
        "mistralai/Mistral-7B-Instruct-v0.2": "text-generation",
    }

    # List of models to try with their expected task types. The configured
    # model's task is now looked up in TASK_MAPPING (it was previously
    # hard-coded to "text-generation" and the mapping was dead code, which
    # broke custom MODEL_NAME values that need a different task). Unknown
    # models still default to "text-generation" as before.
    configured_model = os.getenv("MODEL_NAME", "mistralai/Mistral-7B-Instruct-v0.2")
    models_to_try = [
        (configured_model, TASK_MAPPING.get(configured_model, "text-generation")),
        ("microsoft/DialoGPT-large", "conversational"),
        ("google/flan-t5-xxl", "text2text-generation"),
        ("HuggingFaceH4/zephyr-7b-beta", "conversational")
    ]

    for model_name, task_type in models_to_try:
        print(f"🔍 Testing model initialization with: {model_name} (task: {task_type})")

        try:
            llm = HuggingFaceEndpoint(
                repo_id=model_name,
                huggingfacehub_api_token=api_key,
                task=task_type,  # Specify the correct task type
                temperature=0.1,
                max_new_tokens=100
            )
            print("✅ Model initialized successfully")

            # Test a simple prompt
            print("🔍 Sending test request...")
            # Use appropriate prompt format based on task type
            if task_type == "conversational":
                # For conversational models, we need to format the input as conversation
                response = llm.invoke("Hello, how are you?")
            else:
                response = llm.invoke("Say 'Hello, FINESE SCHOOL!' in one word.")
            print(f"✅ Test request successful")
            print(f"   Response: {response.strip()}")
            return True

        except Exception as e:
            # Broad catch on purpose: any failure just moves on to the next
            # candidate model rather than aborting the diagnostic.
            print(f"❌ Failed with model {model_name}: {str(e)}")
            print("   Trying next model...\n")
            continue

    print("❌ All models failed. Please check your API key and network connection.")
    print("\n💡 Troubleshooting tips:")
    print("   1. Check that your API key is valid")
    print("   2. Verify you have internet connectivity")
    print("   3. Check if there are any firewall restrictions")
    print("   4. Make sure you haven't exceeded your rate limits")
    return False
|
| 95 |
+
|
| 96 |
+
def main():
    """Run the Hugging Face connectivity check and report a summary.

    Exits with status 1 when the check fails so CI scripts can detect it.
    """
    print("🧪 FINESE SCHOOL Hugging Face API Test Script")
    print("=" * 50)

    ok = test_huggingface_api()

    print("\n📋 Summary")
    print("=" * 50)

    if not ok:
        # Failure path: explain and exit non-zero for scriptability.
        print("❌ Hugging Face API setup has issues.")
        print("   Please check the error messages above and fix the issues.")
        sys.exit(1)

    print("✅ Hugging Face API setup is working correctly!")
    print("\n🚀 You can now run the main application:")
    print("   streamlit run src/app.py")

if __name__ == "__main__":
    main()
|
scripts/troubleshoot.py
ADDED
|
@@ -0,0 +1,263 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""Troubleshoot common issues with the FINESE SCHOOL application.
|
| 3 |
+
|
| 4 |
+
This script helps diagnose common configuration issues and provides
|
| 5 |
+
suggestions for fixing them.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import os
|
| 9 |
+
import sys
|
| 10 |
+
from dotenv import load_dotenv
|
| 11 |
+
|
| 12 |
+
# Load environment variables
|
| 13 |
+
load_dotenv()
|
| 14 |
+
|
| 15 |
+
def check_environment_variables():
    """Report which provider-related environment variables are configured.

    Reads API_TYPE (defaulting to "huggingface") and verifies that the API
    key matching that provider is present. The remaining settings
    (MODEL_NAME, TEMPERATURE, MAX_TOKENS, IS_DOCKER) are reported but never
    validated.

    Returns:
        bool: True when the selected provider's API key is set, else False.
    """
    print("🔍 Checking environment variables...")

    # API_TYPE selects which key below is mandatory; any value other than
    # "google" or "openai" is treated as Hugging Face.
    api_type = os.getenv("API_TYPE", "huggingface").lower()
    print(f"✅ API_TYPE is set to: {api_type}")

    # Check API key based on API type
    if api_type == "google":
        api_key = os.getenv("GOOGLE_API_KEY")
        if not api_key:
            print("❌ GOOGLE_API_KEY is not set")
            print("   Please set your Google API key in the .env file")
            return False
        else:
            # Only the length is printed so the secret itself never leaks.
            print(f"✅ GOOGLE_API_KEY is set (length: {len(api_key)} characters)")
    elif api_type == "openai":
        api_key = os.getenv("OPENAI_API_KEY")
        if not api_key:
            print("❌ OPENAI_API_KEY is not set")
            print("   Please set your OpenAI API key in the .env file")
            return False
        else:
            print(f"✅ OPENAI_API_KEY is set (length: {len(api_key)} characters)")
    else:  # huggingface
        api_key = os.getenv("HUGGINGFACE_API_KEY")
        if not api_key:
            print("❌ HUGGINGFACE_API_KEY is not set")
            print("   Please set your Hugging Face API key in the .env file")
            return False
        else:
            print(f"✅ HUGGINGFACE_API_KEY is set (length: {len(api_key)} characters)")

    model_name = os.getenv("MODEL_NAME")
    if model_name:
        print(f"✅ MODEL_NAME is set to: {model_name}")
    else:
        print("⚠️ MODEL_NAME is not set, using default model")

    # The settings below all have defaults, so they are informational only
    # and never cause this check to fail.
    temperature = os.getenv("TEMPERATURE", "0.3")
    print(f"✅ TEMPERATURE is set to: {temperature}")

    max_tokens = os.getenv("MAX_TOKENS", "2048")
    print(f"✅ MAX_TOKENS is set to: {max_tokens}")

    is_docker = os.getenv("IS_DOCKER", "false")
    print(f"✅ IS_DOCKER is set to: {is_docker}")

    return True
|
| 65 |
+
|
| 66 |
+
def check_dependencies():
    """Check if required dependencies are installed.

    Attempts to import each required package and reports the result.

    Returns:
        bool: True when every required package imports cleanly, else False.
    """
    print("\n🔍 Checking dependencies...")

    # Map the pip distribution name (what we show the user and what goes on
    # the `pip install` command line) to the module name used for the import
    # check. These differ for several packages — the previous code passed the
    # pip names straight to __import__, so e.g. __import__("python-dotenv")
    # and __import__("langchain-google-genai") failed unconditionally.
    dependencies = {
        "streamlit": "streamlit",
        "pdfkit": "pdfkit",
        "python-dotenv": "dotenv",
        "langchain": "langchain",
        "pydantic": "pydantic",
        "pygments": "pygments",
    }

    # API-specific dependencies
    api_type = os.getenv("API_TYPE", "huggingface").lower()
    if api_type == "google":
        dependencies["langchain-google-genai"] = "langchain_google_genai"
    elif api_type == "openai":
        dependencies["langchain-openai"] = "langchain_openai"
    else:  # huggingface
        dependencies["langchain-huggingface"] = "langchain_huggingface"

    missing_deps = []
    for dep, module_name in dependencies.items():
        try:
            __import__(module_name)
            print(f"✅ {dep} is installed")
        except ImportError as e:
            print(f"❌ {dep} is missing: {e}")
            missing_deps.append(dep)

    if missing_deps:
        print(f"\n🔧 To install missing dependencies, run:")
        print(f"   pip install {' '.join(missing_deps)}")
        return False

    return True
|
| 103 |
+
|
| 104 |
+
def check_model_access():
    """Check if we can access the configured model.

    Sends a one-line test prompt to the provider selected by API_TYPE and
    reports success or failure with troubleshooting hints.

    Returns:
        bool: True when the test request succeeds, False otherwise.
    """
    print("\n🔍 Checking model access...")

    api_type = os.getenv("API_TYPE", "huggingface").lower()

    def _as_text(response):
        """Normalize an invoke() result to plain text.

        LLM classes (GoogleGenerativeAI, HuggingFaceEndpoint) return a str,
        while chat models (ChatOpenAI) return a message object carrying
        .content — the previous code called .content on the Google response
        too, which raised AttributeError and masked a successful request as
        a failure.
        """
        return response.content if hasattr(response, "content") else response

    if api_type == "google":
        try:
            from langchain_google_genai import GoogleGenerativeAI
            api_key = os.getenv("GOOGLE_API_KEY")
            if not api_key:
                print("❌ Cannot check model access without GOOGLE_API_KEY")
                return False

            model_name = os.getenv("MODEL_NAME", "gemini-1.5-flash")
            print(f"   Testing access to Google model: {model_name}")

            llm = GoogleGenerativeAI(
                model=model_name,
                google_api_key=api_key
            )
            # Test a simple prompt
            print("   Sending test request...")
            response = llm.invoke("Say 'Hello, FINESE SCHOOL!' in one word.")
            print(f"✅ Successfully connected to Google Generative AI")
            print(f"   Test response: {_as_text(response).strip()}")
            return True
        except Exception as e:
            print(f"❌ Failed to access Google model: {str(e)}")
            print("\n💡 Troubleshooting tips:")
            print("   1. Check that your API key is valid")
            print("   2. Verify the model name is correct by running scripts/list_models.py")
            print("   3. Check your internet connection")
            return False

    elif api_type == "openai":
        try:
            from langchain_openai import ChatOpenAI
            api_key = os.getenv("OPENAI_API_KEY")
            if not api_key:
                print("❌ Cannot check model access without OPENAI_API_KEY")
                return False

            model_name = os.getenv("MODEL_NAME", "gpt-3.5-turbo")
            print(f"   Testing access to OpenAI model: {model_name}")

            llm = ChatOpenAI(
                model_name=model_name,
                openai_api_key=api_key
            )
            # Test a simple prompt
            print("   Sending test request...")
            response = llm.invoke("Say 'Hello, FINESE SCHOOL!' in one word.")
            print(f"✅ Successfully connected to OpenAI")
            print(f"   Test response: {_as_text(response).strip()}")
            return True
        except Exception as e:
            print(f"❌ Failed to access OpenAI model: {str(e)}")
            print("\n💡 Troubleshooting tips:")
            print("   1. Check that your API key is valid")
            print("   2. Verify the model name is correct")
            print("   3. Check your internet connection")
            return False

    else:  # huggingface
        try:
            from langchain_huggingface import HuggingFaceEndpoint
            api_key = os.getenv("HUGGINGFACE_API_KEY")
            if not api_key:
                print("❌ Cannot check model access without HUGGINGFACE_API_KEY")
                return False

            model_name = os.getenv("MODEL_NAME", "mistralai/Mistral-7B-Instruct-v0.2")
            print(f"   Testing access to Hugging Face model: {model_name}")

            llm = HuggingFaceEndpoint(
                repo_id=model_name,
                huggingfacehub_api_token=api_key,
                temperature=0.1,
                max_new_tokens=100
            )
            # Test a simple prompt
            print("   Sending test request...")
            response = llm.invoke("Say 'Hello, FINESE SCHOOL!' in one word.")
            print(f"✅ Successfully connected to Hugging Face Inference API")
            print(f"   Test response: {_as_text(response).strip()}")
            return True
        except Exception as e:
            print(f"❌ Failed to access Hugging Face model: {str(e)}")
            print("\n💡 Troubleshooting tips:")
            print("   1. Check that your API key is valid")
            print("   2. Verify the model name is correct")
            print("   3. Check your internet connection")
            print("   4. Make sure you haven't exceeded rate limits")
            return False
|
| 199 |
+
|
| 200 |
+
def check_wkhtmltopdf():
    """Check if wkhtmltopdf is installed for PDF generation.

    Returns False only when the pdfkit Python package itself is missing; a
    missing wkhtmltopdf binary is reported but treated as non-fatal.
    """
    print("\n🔍 Checking PDF generation support...")

    try:
        import pdfkit
    except ImportError:
        print("❌ pdfkit is not installed")
        return False
    print("✅ pdfkit is installed")

    try:
        # configuration() probes the PATH for the wkhtmltopdf binary and
        # raises OSError when it cannot be found.
        config = pdfkit.configuration()
    except OSError:
        print("⚠️ wkhtmltopdf is not installed or not in PATH")
        print("   PDF export functionality will be limited")
        print("\n🔧 To install wkhtmltopdf:")
        print("   Windows: Download from https://wkhtmltopdf.org/downloads.html")
        print("   macOS: brew install --cask wkhtmltopdf")
        print("   Linux: sudo apt-get install wkhtmltopdf")
        return True  # Not critical for basic functionality

    print("✅ wkhtmltopdf is configured")
    return True
|
| 224 |
+
|
| 225 |
+
def main():
    """Run every diagnostic check in order and print an overall summary."""
    print("🛠️ FINESE SCHOOL Troubleshooting Script")
    print("=" * 50)

    diagnostics = (
        check_environment_variables,
        check_dependencies,
        check_model_access,
        check_wkhtmltopdf,
    )

    results = []
    for diagnostic in diagnostics:
        # A crashing check counts as a failure instead of aborting the run.
        try:
            outcome = diagnostic()
        except Exception as e:
            print(f"❌ Check failed with exception: {e}")
            outcome = False
        results.append(outcome)

    print("\n📋 Summary")
    print("=" * 50)

    if all(results):
        print("✅ All checks passed! You should be able to run FINESE SCHOOL.")
        print("\n🚀 To start the application, run:")
        print("   streamlit run src/app.py")
    else:
        passed = sum(results)
        total = len(results)
        print(f"⚠️ {passed}/{total} checks passed.")
        if passed:
            print("✅ Some functionality may work, but fix the issues above for full functionality.")
        else:
            print("❌ Critical issues found. Please address them before running the application.")
        print("\n📝 For more help, check the README.md file or open an issue on GitHub.")

if __name__ == "__main__":
    main()
|
src/__init__.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
ChatBox Pro: Data Science Mentor
|
| 3 |
+
An AI-powered assistant for data science professionals.
|
| 4 |
+
"""
|
src/app.py
ADDED
|
@@ -0,0 +1,439 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys
|
| 2 |
+
import os
|
| 3 |
+
|
| 4 |
+
# Add the project root directory to sys.path
|
| 5 |
+
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 6 |
+
|
| 7 |
+
import streamlit as st
|
| 8 |
+
from dotenv import load_dotenv
|
| 9 |
+
import re
|
| 10 |
+
from src.config import TOPIC_REGISTRY
|
| 11 |
+
from src.chat_engine import generate_structured_response
|
| 12 |
+
from src.pdf_export import export_chat_to_pdf
|
| 13 |
+
from src.utils import detect_language_from_context, sanitize_input
|
| 14 |
+
|
| 15 |
+
# Load environment variables
|
| 16 |
+
if os.getenv("IS_DOCKER") != "true":
|
| 17 |
+
load_dotenv()
|
| 18 |
+
|
| 19 |
+
def highlight_text(text):
    """Highlight sentences containing important keywords with an HTML span.

    The input is split on ". " into rough sentences; any sentence containing
    one of the keywords (case-insensitive) is wrapped in a highlighted
    ``<span>``. Sentences are re-terminated with a single period and joined
    with a single space.

    Bug fix vs. the original: the old version appended "." to every sentence
    AND joined with ". ", so output contained doubled periods ("Hello.. World..").

    Args:
        text: Plain text (or HTML fragment) to process.

    Returns:
        The text with keyword sentences wrapped in styled spans.
    """
    keywords = ["important", "note", "remember", "key", "tip", "⚠️", "only", "strictly", "best practice", "crucial", "essential"]
    highlighted_sentences = []
    for sent in text.split(". "):
        # Normalize: drop surrounding whitespace and any trailing period(s)
        # so exactly one "." is re-added below.
        sent = sent.strip().rstrip(".")
        if not sent:
            continue
        if any(kw.lower() in sent.lower() for kw in keywords):
            highlighted_sentences.append(
                f'<span style="background-color:#fff3cd; color:#856404; font-weight:bold;">{sent}.</span>'
            )
        else:
            highlighted_sentences.append(sent + ".")
    return " ".join(highlighted_sentences)
|
| 31 |
+
|
| 32 |
+
# Configure page
|
| 33 |
+
st.set_page_config(page_title="FINESE SCHOOL: Data Science Mentor", page_icon="🎓", layout="wide")
|
| 34 |
+
|
| 35 |
+
# Define provider key mapping
|
| 36 |
+
PROVIDER_KEY_MAPPING = {
|
| 37 |
+
"Google Gemini": "google",
|
| 38 |
+
"OpenAI": "openai",
|
| 39 |
+
"Hugging Face": "huggingface",
|
| 40 |
+
"Anthropic": "anthropic"
|
| 41 |
+
}
|
| 42 |
+
|
| 43 |
+
# Initialize session state
|
| 44 |
+
if "chat_history" not in st.session_state:
|
| 45 |
+
st.session_state.chat_history = []
|
| 46 |
+
|
| 47 |
+
if "llm_provider" not in st.session_state:
|
| 48 |
+
st.session_state.llm_provider = "Google Gemini"
|
| 49 |
+
if "llm_api_key" not in st.session_state:
|
| 50 |
+
st.session_state.llm_api_key = ""
|
| 51 |
+
if "llm_model" not in st.session_state:
|
| 52 |
+
st.session_state.llm_model = ""
|
| 53 |
+
|
| 54 |
+
if "current_topic" not in st.session_state:
|
| 55 |
+
st.session_state.current_topic = list(TOPIC_REGISTRY.keys())[0] if TOPIC_REGISTRY else None
|
| 56 |
+
|
| 57 |
+
# Apply custom CSS
|
| 58 |
+
st.markdown("""
|
| 59 |
+
<style>
|
| 60 |
+
.diagnosis {
|
| 61 |
+
background-color: #fff8e1;
|
| 62 |
+
padding: 15px;
|
| 63 |
+
border-radius: 10px;
|
| 64 |
+
margin: 15px 0;
|
| 65 |
+
border-left: 5px solid #ffc107;
|
| 66 |
+
box-shadow: 0 2px 5px rgba(0,0,0,0.05);
|
| 67 |
+
}
|
| 68 |
+
.tip {
|
| 69 |
+
background-color: #e8f5e9;
|
| 70 |
+
border-left: 5px solid #4caf50;
|
| 71 |
+
padding: 15px;
|
| 72 |
+
border-radius: 10px;
|
| 73 |
+
margin: 15px 0;
|
| 74 |
+
box-shadow: 0 2px 5px rgba(0,0,0,0.05);
|
| 75 |
+
}
|
| 76 |
+
.refs {
|
| 77 |
+
background-color: #f3e5f5;
|
| 78 |
+
border-left: 5px solid #9c27b0;
|
| 79 |
+
padding: 15px;
|
| 80 |
+
border-radius: 10px;
|
| 81 |
+
margin: 15px 0;
|
| 82 |
+
box-shadow: 0 2px 5px rgba(0,0,0,0.05);
|
| 83 |
+
}
|
| 84 |
+
.stButton>button {
|
| 85 |
+
border-radius: 10px;
|
| 86 |
+
}
|
| 87 |
+
.chat-message {
|
| 88 |
+
padding: 20px;
|
| 89 |
+
border-radius: 10px;
|
| 90 |
+
margin-bottom: 15px;
|
| 91 |
+
box-shadow: 0 2px 5px rgba(0,0,0,0.1);
|
| 92 |
+
}
|
| 93 |
+
.user-message {
|
| 94 |
+
background-color: #e3f2fd;
|
| 95 |
+
border-left: 5px solid #2196f3;
|
| 96 |
+
}
|
| 97 |
+
.assistant-message {
|
| 98 |
+
background-color: #f5f5f5;
|
| 99 |
+
border-left: 5px solid #757575;
|
| 100 |
+
}
|
| 101 |
+
.highlight-keyword {
|
| 102 |
+
background-color: #fff3cd;
|
| 103 |
+
color: #856404;
|
| 104 |
+
font-weight: bold;
|
| 105 |
+
}
|
| 106 |
+
.topic-card {
|
| 107 |
+
border: 1px solid #e0e0e0;
|
| 108 |
+
border-radius: 10px;
|
| 109 |
+
padding: 15px;
|
| 110 |
+
margin-bottom: 15px;
|
| 111 |
+
background-color: #fafafa;
|
| 112 |
+
transition: transform 0.2s;
|
| 113 |
+
}
|
| 114 |
+
.topic-card:hover {
|
| 115 |
+
transform: translateY(-3px);
|
| 116 |
+
box-shadow: 0 4px 8px rgba(0,0,0,0.1);
|
| 117 |
+
}
|
| 118 |
+
.topic-title {
|
| 119 |
+
font-weight: bold;
|
| 120 |
+
font-size: 1.1em;
|
| 121 |
+
margin-bottom: 5px;
|
| 122 |
+
}
|
| 123 |
+
.topic-description {
|
| 124 |
+
color: #666;
|
| 125 |
+
font-size: 0.9em;
|
| 126 |
+
}
|
| 127 |
+
.welcome-banner {
|
| 128 |
+
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
|
| 129 |
+
color: white;
|
| 130 |
+
padding: 25px;
|
| 131 |
+
border-radius: 15px;
|
| 132 |
+
margin-bottom: 25px;
|
| 133 |
+
text-align: center;
|
| 134 |
+
}
|
| 135 |
+
.stats-card {
|
| 136 |
+
background-color: #e3f2fd;
|
| 137 |
+
border-radius: 10px;
|
| 138 |
+
padding: 15px;
|
| 139 |
+
text-align: center;
|
| 140 |
+
margin-bottom: 15px;
|
| 141 |
+
}
|
| 142 |
+
.code-block {
|
| 143 |
+
background-color: #f8f9fa;
|
| 144 |
+
border-radius: 8px;
|
| 145 |
+
padding: 15px;
|
| 146 |
+
overflow-x: auto;
|
| 147 |
+
font-family: monospace;
|
| 148 |
+
font-size: 0.9em;
|
| 149 |
+
margin: 15px 0;
|
| 150 |
+
border: 1px solid #eee;
|
| 151 |
+
}
|
| 152 |
+
.on-topic-warning {
|
| 153 |
+
background-color: #ffebee;
|
| 154 |
+
border-left: 5px solid #f44336;
|
| 155 |
+
padding: 15px;
|
| 156 |
+
border-radius: 10px;
|
| 157 |
+
margin: 15px 0;
|
| 158 |
+
}
|
| 159 |
+
</style>
|
| 160 |
+
""", unsafe_allow_html=True)
|
| 161 |
+
|
| 162 |
+
# Header
|
| 163 |
+
st.markdown('<div class="welcome-banner"><h1>🎓 FINESE SCHOOL: Your 24/7 Data Mentor</h1><p>Get expert-level, topic-locked, code-rich answers with best practices</p></div>', unsafe_allow_html=True)
|
| 164 |
+
|
| 165 |
+
# Sidebar
|
| 166 |
+
with st.sidebar:
|
| 167 |
+
st.header("⚙️ Settings & Controls")
|
| 168 |
+
|
| 169 |
+
# Theme selector
|
| 170 |
+
theme = st.selectbox("🎨 Theme", ["Light", "Dark"])
|
| 171 |
+
if theme == "Dark":
|
| 172 |
+
st.markdown("""
|
| 173 |
+
<style>
|
| 174 |
+
.stApp {
|
| 175 |
+
background-color: #0e1117;
|
| 176 |
+
color: white;
|
| 177 |
+
}
|
| 178 |
+
.stMarkdown, .stText {
|
| 179 |
+
color: white;
|
| 180 |
+
}
|
| 181 |
+
.topic-card {
|
| 182 |
+
background-color: #262730;
|
| 183 |
+
color: white;
|
| 184 |
+
}
|
| 185 |
+
.topic-description {
|
| 186 |
+
color: #ccc;
|
| 187 |
+
}
|
| 188 |
+
</style>
|
| 189 |
+
""", unsafe_allow_html=True)
|
| 190 |
+
|
| 191 |
+
st.divider()
|
| 192 |
+
st.subheader("🤖 LLM Provider")
|
| 193 |
+
llm_provider = st.selectbox(
|
| 194 |
+
"Select LLM Provider",
|
| 195 |
+
["Google Gemini", "OpenAI", "Hugging Face", "Anthropic", "None"],
|
| 196 |
+
index=0,
|
| 197 |
+
key="llm_provider"
|
| 198 |
+
)
|
| 199 |
+
|
| 200 |
+
provider_key = PROVIDER_KEY_MAPPING.get(llm_provider, "")
|
| 201 |
+
if llm_provider != "None" and provider_key:
|
| 202 |
+
api_key = st.text_input(
|
| 203 |
+
f"{llm_provider} API Key",
|
| 204 |
+
type="password",
|
| 205 |
+
key=f"{provider_key}_api_key",
|
| 206 |
+
help="Enter your API key for the selected provider"
|
| 207 |
+
)
|
| 208 |
+
|
| 209 |
+
# Define provider-specific model options
|
| 210 |
+
PROVIDER_MODELS = {
|
| 211 |
+
"Google Gemini": [
|
| 212 |
+
"gemini-1.5-flash", "gemini-1.5-pro", "gemini-1.5-advanced",
|
| 213 |
+
"gemini-1.0-pro", "gemini-1.5-ultra"
|
| 214 |
+
],
|
| 215 |
+
"OpenAI": [
|
| 216 |
+
"gpt-4o", "gpt-4-turbo", "gpt-3.5-turbo",
|
| 217 |
+
"gpt-4", "gpt-4-32k"
|
| 218 |
+
],
|
| 219 |
+
"Hugging Face": [
|
| 220 |
+
"mistralai/Mistral-7B-Instruct-v0.2", "meta-llama/Llama-3-8b-chat-hf",
|
| 221 |
+
"google/flan-t5-xxl", "HuggingFaceH4/zephyr-7b-beta"
|
| 222 |
+
],
|
| 223 |
+
"Anthropic": [
|
| 224 |
+
"claude-3-5-sonnet-20240620", "claude-3-opus-20240229",
|
| 225 |
+
"claude-3-haiku-20240307", "claude-2.1"
|
| 226 |
+
]
|
| 227 |
+
}
|
| 228 |
+
|
| 229 |
+
# Get models for selected provider
|
| 230 |
+
model_options = PROVIDER_MODELS.get(llm_provider, [])
|
| 231 |
+
model_options.append("Custom Model")
|
| 232 |
+
|
| 233 |
+
# Use the extracted model options in the selectbox
|
| 234 |
+
model_name = st.selectbox(
|
| 235 |
+
"Model Name",
|
| 236 |
+
options=model_options,
|
| 237 |
+
key=f"{provider_key}_model",
|
| 238 |
+
help="Select a model name or choose 'Custom Model' to enter your own"
|
| 239 |
+
)
|
| 240 |
+
|
| 241 |
+
# Simplify the custom model input logic
|
| 242 |
+
if model_name == "Custom Model":
|
| 243 |
+
custom_model_name = st.text_input(
|
| 244 |
+
"Enter a custom model name",
|
| 245 |
+
placeholder="Type your model name here...",
|
| 246 |
+
key=f"{provider_key}_custom_model"
|
| 247 |
+
)
|
| 248 |
+
if not custom_model_name.strip():
|
| 249 |
+
st.error("Custom model name cannot be empty.")
|
| 250 |
+
else:
|
| 251 |
+
custom_model_name = None
|
| 252 |
+
|
| 253 |
+
|
| 254 |
+
# Stats
|
| 255 |
+
st.divider()
|
| 256 |
+
st.subheader("📊 Session Stats")
|
| 257 |
+
st.markdown(f'<div class="stats-card"><h3>{len(st.session_state.chat_history)//2}</h3><p>Questions Asked</p></div>', unsafe_allow_html=True)
|
| 258 |
+
|
| 259 |
+
# Topic information
|
| 260 |
+
st.divider()
|
| 261 |
+
st.subheader("📘 Topics")
|
| 262 |
+
for topic_key, topic_spec in TOPIC_REGISTRY.items():
|
| 263 |
+
with st.expander(topic_key):
|
| 264 |
+
st.markdown(f"""
|
| 265 |
+
<div class="topic-card">
|
| 266 |
+
<div class="topic-title">{topic_spec.name}</div>
|
| 267 |
+
<div class="topic-description">{topic_spec.description}</div>
|
| 268 |
+
<div style="margin-top: 10px;">
|
| 269 |
+
<strong>Domain:</strong> {topic_spec.domain}<br>
|
| 270 |
+
<strong>Allowed Libraries:</strong> {', '.join(topic_spec.allowed_libraries) or 'None'}<br>
|
| 271 |
+
<strong>Banned Topics:</strong> {', '.join(topic_spec.banned_topics) or 'None'}
|
| 272 |
+
</div>
|
| 273 |
+
</div>
|
| 274 |
+
""", unsafe_allow_html=True)
|
| 275 |
+
|
| 276 |
+
# Conversation history controls
|
| 277 |
+
st.divider()
|
| 278 |
+
st.subheader("🗂️ Conversation")
|
| 279 |
+
col1, col2 = st.columns(2)
|
| 280 |
+
with col1:
|
| 281 |
+
if st.button("🗑️ Clear History", use_container_width=True):
|
| 282 |
+
st.session_state.chat_history = []
|
| 283 |
+
st.success("History cleared!")
|
| 284 |
+
st.rerun()
|
| 285 |
+
|
| 286 |
+
with col2:
|
| 287 |
+
if st.button("📥 Export to PDF", use_container_width=True):
|
| 288 |
+
if st.session_state.chat_history:
|
| 289 |
+
try:
|
| 290 |
+
with st.spinner("Generating PDF..."):
|
| 291 |
+
pdf_bytes = export_chat_to_pdf(st.session_state.chat_history)
|
| 292 |
+
st.download_button(
|
| 293 |
+
"✅ Download PDF",
|
| 294 |
+
pdf_bytes,
|
| 295 |
+
"data_mentor_session.pdf",
|
| 296 |
+
"application/pdf",
|
| 297 |
+
use_container_width=True
|
| 298 |
+
)
|
| 299 |
+
except Exception as e:
|
| 300 |
+
st.error(f"PDF generation failed: {str(e)}")
|
| 301 |
+
st.info("Please try again or contact support if the issue persists.")
|
| 302 |
+
else:
|
| 303 |
+
st.warning("No conversation to export")
|
| 304 |
+
|
| 305 |
+
# Info
|
| 306 |
+
st.divider()
|
| 307 |
+
st.subheader("ℹ️ About")
|
| 308 |
+
st.info("FINESE SCHOOL provides expert-level answers on data science topics with code examples and best practices.")
|
| 309 |
+
|
| 310 |
+
# API Key validation - MOVED AFTER SIDEBAR
|
| 311 |
+
current_provider = st.session_state.llm_provider
|
| 312 |
+
if current_provider != "None":
|
| 313 |
+
provider_key = PROVIDER_KEY_MAPPING.get(current_provider, "")
|
| 314 |
+
if provider_key:
|
| 315 |
+
api_key = st.session_state.get(f"{provider_key}_api_key", "")
|
| 316 |
+
if not api_key:
|
| 317 |
+
st.error(f"⚠️ {current_provider} API key not found. Please enter your API key in the sidebar.")
|
| 318 |
+
st.stop()
|
| 319 |
+
|
| 320 |
+
# Main interface
|
| 321 |
+
col1, col2 = st.columns([1, 2])
|
| 322 |
+
|
| 323 |
+
with col1:
|
| 324 |
+
st.header("🎯 Select Topic")
|
| 325 |
+
topic_keys = list(TOPIC_REGISTRY.keys())
|
| 326 |
+
selected_topic = st.selectbox("Choose your domain", topic_keys, index=topic_keys.index(st.session_state.current_topic) if st.session_state.current_topic in topic_keys else 0)
|
| 327 |
+
st.session_state.current_topic = selected_topic
|
| 328 |
+
|
| 329 |
+
topic_spec = TOPIC_REGISTRY[selected_topic]
|
| 330 |
+
st.markdown(f"""
|
| 331 |
+
<div class="topic-card">
|
| 332 |
+
<div class="topic-title">Current Topic: {topic_spec.name}</div>
|
| 333 |
+
<div class="topic-description">{topic_spec.description}</div>
|
| 334 |
+
<div style="margin-top: 10px;">
|
| 335 |
+
<strong>Style Guide:</strong> {topic_spec.style_guide}
|
| 336 |
+
</div>
|
| 337 |
+
</div>
|
| 338 |
+
""", unsafe_allow_html=True)
|
| 339 |
+
|
| 340 |
+
with col2:
|
| 341 |
+
st.header("❓ Ask a Question")
|
| 342 |
+
user_q = st.text_area("Enter your precise question", height=120, placeholder=f"Ask anything about {selected_topic}...")
|
| 343 |
+
|
| 344 |
+
col_btn1, col_btn2 = st.columns(2)
|
| 345 |
+
with col_btn1:
|
| 346 |
+
submit = st.button("🧠 Get Expert Answer", type="primary", use_container_width=True)
|
| 347 |
+
with col_btn2:
|
| 348 |
+
clear = st.button("🗑️ Clear Chat", use_container_width=True)
|
| 349 |
+
|
| 350 |
+
# Process user query
|
| 351 |
+
if submit and user_q.strip():
|
| 352 |
+
# Sanitize input
|
| 353 |
+
sanitized_question = sanitize_input(user_q.strip())
|
| 354 |
+
|
| 355 |
+
if len(sanitized_question) < 10:
|
| 356 |
+
st.warning("Please enter a more detailed question (at least 10 characters).")
|
| 357 |
+
else:
|
| 358 |
+
try:
|
| 359 |
+
with st.spinner("Dr. Data is analyzing your question..."):
|
| 360 |
+
# Add user question to chat
|
| 361 |
+
st.session_state.chat_history.append(("🧑🎓 You", sanitized_question))
|
| 362 |
+
|
| 363 |
+
# Generate response
|
| 364 |
+
response = generate_structured_response(selected_topic, sanitized_question)
|
| 365 |
+
|
| 366 |
+
if not response.is_on_topic:
|
| 367 |
+
msg = f'<div class="on-topic-warning"><strong>⚠️ Off-topic Question</strong><br>{response.answer}</div>'
|
| 368 |
+
st.session_state.chat_history.append(("🤖 Dr. Data", msg))
|
| 369 |
+
else:
|
| 370 |
+
# Build rich response
|
| 371 |
+
parts = []
|
| 372 |
+
if response.diagnosis:
|
| 373 |
+
parts.append(f'<div class="diagnosis"><strong>🔍 Diagnosis:</strong> {response.diagnosis}</div>')
|
| 374 |
+
parts.append(f'<div class="answer">{response.answer}</div>')
|
| 375 |
+
if response.code_example:
|
| 376 |
+
lang = detect_language_from_context(sanitized_question, selected_topic)
|
| 377 |
+
parts.append(f'<div class="code-block">{response.code_example}</div>')
|
| 378 |
+
if response.best_practice_tip:
|
| 379 |
+
parts.append(f'<div class="tip"><strong>💡 Best Practice:</strong> {response.best_practice_tip}</div>')
|
| 380 |
+
if response.references:
|
| 381 |
+
refs = "<br>".join(f"• <a href='{r}' target='_blank'>{r}</a>" for r in response.references)
|
| 382 |
+
parts.append(f'<div class="refs"><strong>📚 References:</strong><br>{refs}</div>')
|
| 383 |
+
|
| 384 |
+
full_response = "".join(parts)
|
| 385 |
+
# Apply highlighting to the response
|
| 386 |
+
highlighted_response = highlight_text(full_response)
|
| 387 |
+
st.session_state.chat_history.append(("🤖 Dr. Data", highlighted_response))
|
| 388 |
+
|
| 389 |
+
st.rerun()
|
| 390 |
+
except Exception as e:
|
| 391 |
+
st.error(f"❌ Tutor error: {str(e)}")
|
| 392 |
+
# Add error to chat for context
|
| 393 |
+
st.session_state.chat_history.append(("🤖 Dr. Data", f"❌ Sorry, I encountered an error: {str(e)}"))
|
| 394 |
+
|
| 395 |
+
# Clear chat
|
| 396 |
+
if clear:
|
| 397 |
+
st.session_state.chat_history = []
|
| 398 |
+
st.success("Chat cleared!")
|
| 399 |
+
st.rerun()
|
| 400 |
+
|
| 401 |
+
# Render chat with markdown + HTML
|
| 402 |
+
st.divider()
|
| 403 |
+
st.header("💬 Conversation")
|
| 404 |
+
|
| 405 |
+
# Limit conversation history for performance
|
| 406 |
+
MAX_HISTORY = 50
|
| 407 |
+
if len(st.session_state.chat_history) > MAX_HISTORY * 2:
|
| 408 |
+
st.session_state.chat_history = st.session_state.chat_history[-MAX_HISTORY * 2:]
|
| 409 |
+
|
| 410 |
+
# Display messages
|
| 411 |
+
if st.session_state.chat_history:
|
| 412 |
+
for sender, content in st.session_state.chat_history:
|
| 413 |
+
is_user = "You" in sender
|
| 414 |
+
message_class = "user-message" if is_user else "assistant-message"
|
| 415 |
+
|
| 416 |
+
with st.container():
|
| 417 |
+
if is_user:
|
| 418 |
+
st.markdown(
|
| 419 |
+
f"""
|
| 420 |
+
<div class="chat-message {message_class}">
|
| 421 |
+
<strong>{sender}</strong>
|
| 422 |
+
<div style="margin-top: 10px;">{content}</div>
|
| 423 |
+
</div>
|
| 424 |
+
""",
|
| 425 |
+
unsafe_allow_html=True
|
| 426 |
+
)
|
| 427 |
+
else:
|
| 428 |
+
# Assistant message with enhanced styling
|
| 429 |
+
st.markdown(
|
| 430 |
+
f"""
|
| 431 |
+
<div class="chat-message {message_class}">
|
| 432 |
+
<strong>{sender}</strong>
|
| 433 |
+
<div style="margin-top: 10px;">{content}</div>
|
| 434 |
+
</div>
|
| 435 |
+
""",
|
| 436 |
+
unsafe_allow_html=True
|
| 437 |
+
)
|
| 438 |
+
else:
|
| 439 |
+
st.info("👋 Welcome! Select a topic and ask your first question to get started.")
|
src/chat_engine.py
ADDED
|
@@ -0,0 +1,228 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import logging
|
| 3 |
+
from langchain_core.output_parsers import PydanticOutputParser
|
| 4 |
+
from langchain_core.prompts import ChatPromptTemplate, SystemMessagePromptTemplate
|
| 5 |
+
from src.config import TOPIC_REGISTRY, MODEL_NAME, TEMPERATURE, MAX_TOKENS
|
| 6 |
+
from src.models import TutorResponse
|
| 7 |
+
|
| 8 |
+
# Conditional imports based on available API
|
| 9 |
+
try:
|
| 10 |
+
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAI
|
| 11 |
+
GOOGLE_API_AVAILABLE = True
|
| 12 |
+
except ImportError:
|
| 13 |
+
GOOGLE_API_AVAILABLE = False
|
| 14 |
+
logging.warning("Google Generative AI library not available")
|
| 15 |
+
|
| 16 |
+
try:
|
| 17 |
+
from langchain_huggingface import HuggingFaceEndpoint
|
| 18 |
+
HUGGINGFACE_API_AVAILABLE = True
|
| 19 |
+
except ImportError:
|
| 20 |
+
HUGGINGFACE_API_AVAILABLE = False
|
| 21 |
+
logging.warning("HuggingFace library not available")
|
| 22 |
+
|
| 23 |
+
try:
|
| 24 |
+
from langchain_openai import ChatOpenAI
|
| 25 |
+
OPENAI_API_AVAILABLE = True
|
| 26 |
+
except ImportError:
|
| 27 |
+
OPENAI_API_AVAILABLE = False
|
| 28 |
+
logging.warning("OpenAI library not available")
|
| 29 |
+
|
| 30 |
+
# Set up logging
|
| 31 |
+
logging.basicConfig(level=logging.INFO)
|
| 32 |
+
logger = logging.getLogger(__name__)
|
| 33 |
+
|
| 34 |
+
def get_llm():
    """Return an LLM client for the provider named by the API_TYPE env var.

    Falls back through huggingface -> google -> openai when the requested
    provider's client library is not installed.

    Raises:
        RuntimeError: When no supported provider library is available.
    """
    factories = {
        "google": (GOOGLE_API_AVAILABLE, get_google_llm),
        "openai": (OPENAI_API_AVAILABLE, get_openai_llm),
        "huggingface": (HUGGINGFACE_API_AVAILABLE, get_huggingface_llm),
    }

    requested = os.getenv("API_TYPE", "huggingface").lower()
    available, factory = factories.get(requested, (False, None))
    if available:
        return factory()

    # Requested provider unusable: try the remaining providers in fixed priority order.
    for name in ("huggingface", "google", "openai"):
        available, factory = factories[name]
        if available:
            return factory()

    raise RuntimeError("No suitable LLM API available. Please install one of: langchain-google-genai, langchain-huggingface, langchain-openai")
|
| 54 |
+
|
| 55 |
+
def get_google_llm():
    """Create a ChatGoogleGenerativeAI client configured from the environment.

    Returns:
        A ChatGoogleGenerativeAI instance using MODEL_NAME (or a flash default).

    Raises:
        RuntimeError: When GOOGLE_API_KEY is not set.
    """
    api_key = os.getenv("GOOGLE_API_KEY")
    if not api_key:
        raise RuntimeError("GOOGLE_API_KEY is required for Google API")

    # Empty/unset MODEL_NAME falls back to a current default.
    model_name = MODEL_NAME or "gemini-1.5-flash"
    logger.info(f"Initializing Google LLM with model: {model_name}")

    return ChatGoogleGenerativeAI(
        model=model_name,
        google_api_key=api_key,
        temperature=TEMPERATURE,
        max_tokens=MAX_TOKENS,
        # Required for Gemini in LangChain: Gemini lacks a native system role.
        convert_system_message_to_human=True,
    )
|
| 72 |
+
|
| 73 |
+
def get_openai_llm():
    """Create a ChatOpenAI client configured from the environment.

    Returns:
        A ChatOpenAI instance using MODEL_NAME (or gpt-3.5-turbo by default).

    Raises:
        RuntimeError: When OPENAI_API_KEY is not set.
    """
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError("OPENAI_API_KEY is required for OpenAI API")

    # Empty/unset MODEL_NAME falls back to a sensible default.
    model_name = MODEL_NAME or "gpt-3.5-turbo"
    logger.info(f"Initializing OpenAI LLM with model: {model_name}")

    return ChatOpenAI(
        model_name=model_name,
        openai_api_key=api_key,
        temperature=TEMPERATURE,
        max_tokens=MAX_TOKENS,
    )
|
| 89 |
+
|
| 90 |
+
# ... existing code ...
|
| 91 |
+
def get_huggingface_llm():
    """Build a HuggingFaceEndpoint LLM from environment configuration.

    Reads HUGGINGFACE_API_KEY from the environment and uses MODEL_NAME (or a
    Mistral-7B-Instruct default) as the repo id. The endpoint ``task`` is
    inferred from the model name so chat-tuned and seq2seq models are invoked
    with the right pipeline.

    Returns:
        A configured HuggingFaceEndpoint instance.

    Raises:
        RuntimeError: If the API key is missing or endpoint creation fails
            (the original exception is chained as the cause).
    """
    key = os.getenv("HUGGINGFACE_API_KEY")
    if not key:
        raise RuntimeError("HUGGINGFACE_API_KEY is required for Hugging Face API. Please set your API key.")

    # Default to a good open-source model if none specified.
    model_name = MODEL_NAME if MODEL_NAME else "mistralai/Mistral-7B-Instruct-v0.2"
    logger.info(f"Initializing HuggingFace LLM with model: {model_name}")

    # Infer the endpoint task from the model family.
    # Lowercase once instead of once per substring check.
    lowered = model_name.lower()
    task = "text-generation"
    if any(marker in lowered for marker in ("zephyr", "dialo", "mistral")):
        task = "conversational"
    elif "flan" in lowered or "t5" in lowered:
        # flan/T5-style models are seq2seq and need the text2text task.
        task = "text2text-generation"

    try:
        return HuggingFaceEndpoint(
            repo_id=model_name,
            huggingfacehub_api_token=key,
            task=task,
            temperature=TEMPERATURE,
            max_new_tokens=MAX_TOKENS,
        )
    except Exception as e:
        # Chain the cause so the underlying client error stays in the traceback.
        raise RuntimeError(f"Failed to initialize Hugging Face model {model_name}: {str(e)}") from e
|
| 124 |
+
|
| 125 |
+
def validate_model_availability(model_name: str, api_key: str):
    """Check whether *model_name* is usable with *api_key*.

    Currently a stub: no provider-specific validation is performed, so the
    call always succeeds and any real availability error surfaces later at
    initialization time.

    Args:
        model_name: Name of the model to check.
        api_key: API key.

    Raises:
        RuntimeError: If the model is not available (not raised by the
            current stub implementation).
    """
    # Simplified validation approach: warn and defer to initialization.
    logger.warning("Model validation is not implemented for all providers. Proceeding with initialization.")
|
| 139 |
+
|
| 140 |
+
def build_expert_prompt(topic_spec, user_question: str) -> ChatPromptTemplate:
    """Build the system+human chat prompt for a topic-locked tutoring session.

    The returned template has two input variables: ``format_instructions``
    (bound later by the caller via ``prompt.partial``) and ``question``
    (bound at invoke time through the human message).

    Args:
        topic_spec: TopicSpec describing the topic name, allowed libraries,
            banned topics and style guide embedded into the system message.
        user_question: Unused here — the question is injected through the
            ``{question}`` placeholder at invoke time; the parameter is kept
            for interface compatibility with callers.

    Returns:
        A ChatPromptTemplate ready for partial/format binding.
    """
    # NOTE: removed an unused local PydanticOutputParser — the caller owns the
    # parser and supplies format_instructions itself.
    system_message = f"""
You are Dr. Data, a world-class data science educator with PhDs in CS and Statistics.
You are tutoring a professional on: **{topic_spec.name}**

Context:
- Allowed libraries: {', '.join(topic_spec.allowed_libraries) or 'None'}
- Avoid: {', '.join(topic_spec.banned_topics) or 'Nothing'}
- Style: {topic_spec.style_guide}

Rules:
1. If the question is off-topic (e.g., about web dev in a Pandas session), set is_on_topic=False and give a polite redirect.
2. Always attempt diagnosis: what might the user be confused about?
3. Code must be minimal, correct, and include necessary imports.
4. Cite official documentation when possible.
5. NEVER hallucinate package functions.
6. Output ONLY in the requested JSON format.

{{format_instructions}}

"""

    return ChatPromptTemplate.from_messages([
        SystemMessagePromptTemplate.from_template(system_message),
        ("human", "Question: {question}")
    ])
|
| 168 |
+
|
| 169 |
+
def generate_structured_response(topic_key: str, user_question: str) -> TutorResponse:
    """Ask the configured LLM a topic-locked question and parse a TutorResponse.

    Builds the expert prompt for ``topic_key``, invokes the LLM with
    ``user_question``, and parses the output into a TutorResponse. If strict
    parsing fails, attempts to salvage a JSON object from the raw output.

    Args:
        topic_key: Key into TOPIC_REGISTRY selecting the topic spec.
        user_question: The user's question text.

    Returns:
        TutorResponse: The parsed, validated structured answer.

    Raises:
        RuntimeError: If the LLM cannot be initialized or the call fails
            (message classified from the underlying error text).
        ValueError: If the LLM output cannot be parsed as a TutorResponse.
        KeyError: If ``topic_key`` is not in TOPIC_REGISTRY.
    """
    try:
        llm = get_llm()
    except Exception as e:
        raise RuntimeError(f"Failed to initialize LLM: {str(e)}")

    topic_spec = TOPIC_REGISTRY[topic_key]

    # Create parser
    parser = PydanticOutputParser(pydantic_object=TutorResponse)

    # Build prompt with proper variable names
    prompt = build_expert_prompt(topic_spec, user_question)

    # Create the chain with proper variable binding
    # format_instructions is fixed now; only {question} remains for invoke().
    chain = prompt.partial(format_instructions=parser.get_format_instructions()) | llm

    # Invoke with the question
    try:
        raw_output = chain.invoke({"question": user_question})
        logger.info(f"Raw LLM output: {raw_output.content[:200]}...")
    except Exception as e:
        # Classify the provider error into a user-actionable message by
        # substring-matching the lowercased error text.
        error_msg = str(e).lower()
        if "401" in error_msg or "unauthorized" in error_msg:
            detailed_msg = "API key is invalid or expired. Please check your API key in the sidebar settings."
        elif "429" in error_msg or "rate limit" in error_msg:
            detailed_msg = "Rate limit exceeded. Please wait a few minutes or check your API plan limits."
        elif "connection" in error_msg or "timeout" in error_msg:
            detailed_msg = "Network connection issue. Please check your internet connection and try again."
        elif "model" in error_msg and "not found" in error_msg:
            detailed_msg = f"Model '{MODEL_NAME}' not available. Please select a valid model from the dropdown or check spelling."
        else:
            detailed_msg = f"Unexpected error: {str(e)}. Please check your model configuration."
        raise RuntimeError(f"Failed to get response from LLM: {detailed_msg}")

    # Parse and validate
    try:
        response = parser.parse(raw_output.content)
    except Exception as e:
        # Try to extract JSON from the response if parsing fails
        import re
        import json

        # Look for JSON in the response
        # Greedy match grabs from the first '{' to the last '}'.
        json_match = re.search(r'\{.*\}', raw_output.content, re.DOTALL)
        if json_match:
            try:
                json_str = json_match.group(0)
                # Fix common JSON issues
                json_str = json_str.replace('\n', '').replace('\t', '')
                # Parse and reconstruct response
                json_data = json.loads(json_str)
                response = TutorResponse(**json_data)
            except Exception as json_e:
                raise ValueError(f"Failed to parse LLM output as JSON: {json_e}\nOriginal error: {e}\nRaw: {raw_output.content[:500]}...")
        else:
            # Fallback: retry with stricter prompt or return error
            raise ValueError(f"Failed to parse LLM output: {e}\nRaw: {raw_output.content[:500]}...")

    return response
|
src/config.py
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from typing import Dict, List, Literal
|
| 3 |
+
from pydantic import BaseModel
|
| 4 |
+
|
| 5 |
+
class TopicSpec(BaseModel):
    """Declarative description of one teachable topic.

    Consumed by the chat engine to build a topic-scoped system prompt and
    to keep the tutor's answers inside the topic's boundaries.
    """

    # Display name of the topic (also used as the TOPIC_REGISTRY key).
    name: str
    # One-line summary of what the topic covers.
    description: str
    # Broad category the topic belongs to; constrained to a closed set.
    domain: Literal["programming", "analysis", "visualization", "bi", "ml", "dl"]
    # Libraries the tutor may reference in code examples ([] = no code libraries).
    allowed_libraries: List[str]
    banned_topics: List[str]  # e.g., web dev, mobile
    # Free-form instructions on answer tone/format for this topic.
    style_guide: str
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
# Registry of every tutorable topic, keyed by the display name shown in the UI.
# Each TopicSpec scopes the assistant: what it may cover, which libraries it
# may use in examples, and the answer style it should follow.
TOPIC_REGISTRY = {
    "Python": TopicSpec(
        name="Python",
        description="Core Python: data structures, functions, decorators, context managers, type hints, performance.",
        domain="programming",
        allowed_libraries=["builtins", "collections", "itertools", "functools", "pathlib", "json"],
        banned_topics=["Django", "Flask", "GUI", "web scraping", "APIs"],
        style_guide="Be concise. Prefer standard library. Use type hints. Show 1-2 line examples unless complex."
    ),
    "Data Analysis with Pandas & NumPy": TopicSpec(
        name="Data Analysis with Pandas & NumPy",
        description="Data wrangling, vectorization, time series, memory optimization.",
        domain="analysis",
        allowed_libraries=["pandas", "numpy", "polars"],
        banned_topics=["web", "streaming", "big data frameworks"],
        style_guide="Always show DataFrame/Series input and output. Use .head() in examples. Avoid chained indexing."
    ),
    "SQL": TopicSpec(
        name="SQL",
        description="ANSI SQL with focus on PostgreSQL/SQLite. Window functions, CTEs, optimization.",
        domain="analysis",
        allowed_libraries=[],
        banned_topics=["ORM", "NoSQL", "MongoDB"],
        style_guide="Use explicit JOINs. Prefer CTEs over subqueries. Comment on performance implications."
    ),
    "Power BI": TopicSpec(
        name="Power BI",
        description="DAX formulas, data modeling, relationships, performance tuning.",
        domain="bi",
        allowed_libraries=[],
        banned_topics=["Tableau", "Looker", "Python scripts in PBI"],
        style_guide="Explain DAX logic step-by-step. Use VAR for readability. Warn about context transition gotchas."
    ),
    "Machine Learning": TopicSpec(
        name="Machine Learning",
        description="Scikit-learn, model evaluation, feature engineering, interpretability.",
        domain="ml",
        allowed_libraries=["sklearn", "xgboost", "lightgbm", "shap", "eli5"],
        banned_topics=["LLMs", "neural nets", "PyTorch/TensorFlow"],
        style_guide="Use pipelines. Show cross-validation. Emphasize data leakage prevention."
    ),
    "Deep Learning": TopicSpec(
        name="Deep Learning",
        description="Neural networks with TensorFlow/PyTorch: CNNs, RNNs, transformers basics.",
        domain="dl",
        allowed_libraries=["torch", "tensorflow", "keras", "transformers"],
        banned_topics=["web deployment", "mobile"],
        style_guide="Use high-level APIs (e.g., tf.keras). Show model.summary(). Include input shape."
    ),
    "Data Visualization": TopicSpec(
        name="Data Visualization",
        description="Effective static & interactive plots for insight communication.",
        domain="visualization",
        allowed_libraries=["matplotlib", "seaborn", "plotly", "altair"],
        banned_topics=["D3.js", "web dashboards beyond Plotly"],
        style_guide="Explain design choices (color, scale). Prefer Plotly for interactivity. Avoid pie charts."
    ),
}
|
| 72 |
+
|
| 73 |
+
# Add validation for model configuration
|
| 74 |
+
# Default to a more current and widely available model based on API type
|
| 75 |
+
API_TYPE = os.getenv("API_TYPE", "huggingface").lower()
|
| 76 |
+
|
| 77 |
+
if API_TYPE == "google":
|
| 78 |
+
DEFAULT_MODEL = "gemini-1.5-flash"
|
| 79 |
+
elif API_TYPE == "openai":
|
| 80 |
+
DEFAULT_MODEL = "gpt-3.5-turbo"
|
| 81 |
+
else: # huggingface
|
| 82 |
+
DEFAULT_MODEL = "mistralai/Mistral-7B-Instruct-v0.2"
|
| 83 |
+
|
| 84 |
+
MODEL_NAME = os.getenv("MODEL_NAME", DEFAULT_MODEL)
|
| 85 |
+
|
| 86 |
+
# Ensure that the model name is valid
|
| 87 |
+
if not MODEL_NAME:
|
| 88 |
+
MODEL_NAME = DEFAULT_MODEL
|
| 89 |
+
|
| 90 |
+
try:
|
| 91 |
+
TEMPERATURE = float(os.getenv("TEMPERATURE", "0.3"))
|
| 92 |
+
except ValueError:
|
| 93 |
+
TEMPERATURE = 0.3
|
| 94 |
+
|
| 95 |
+
try:
|
| 96 |
+
MAX_TOKENS = int(os.getenv("MAX_TOKENS", "2048"))
|
| 97 |
+
except ValueError:
|
| 98 |
+
MAX_TOKENS = 2048
|
| 99 |
+
|
| 100 |
+
# Validate temperature range
|
| 101 |
+
if TEMPERATURE < 0 or TEMPERATURE > 1:
|
| 102 |
+
TEMPERATURE = 0.3
|
| 103 |
+
|
| 104 |
+
# Validate max tokens range
|
| 105 |
+
if MAX_TOKENS < 1 or MAX_TOKENS > 8192:
|
| 106 |
+
MAX_TOKENS = 2048
|
src/models.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pydantic import BaseModel, Field
|
| 2 |
+
from typing import List, Optional
|
| 3 |
+
|
| 4 |
+
class TutorResponse(BaseModel):
    """Structured answer returned by the LLM tutor.

    Parsed from the model's JSON output; field descriptions double as the
    schema documentation fed to the output parser.
    """

    is_on_topic: bool = Field(..., description="True only if question matches selected topic")
    diagnosis: Optional[str] = Field(None, description="What the user might be misunderstanding")
    answer: str = Field(..., description="Clear, step-by-step explanation")
    code_example: Optional[str] = Field(None, description="Minimal, runnable code if applicable")
    best_practice_tip: Optional[str] = Field(None, description="One key tip or warning")
    references: List[str] = Field(default_factory=list, description="Official docs or authoritative sources")
|
src/pdf_export.py
ADDED
|
@@ -0,0 +1,192 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pdfkit
|
| 2 |
+
import tempfile
|
| 3 |
+
import os
|
| 4 |
+
import html
|
| 5 |
+
from pygments import highlight
|
| 6 |
+
from pygments.lexers import get_lexer_by_name, guess_lexer
|
| 7 |
+
from pygments.formatters import HtmlFormatter
|
| 8 |
+
from src.utils import strip_html
|
| 9 |
+
import logging
|
| 10 |
+
|
| 11 |
+
logger = logging.getLogger(__name__)
|
| 12 |
+
|
| 13 |
+
def syntax_highlight_code(code: str, language: str = "python") -> str:
    """Return `code` as Pygments-highlighted HTML.

    Falls back to guessing the lexer from the code itself, then to the
    plain-text lexer, so this never raises on an unknown `language`.

    Args:
        code: Source code to highlight.
        language: Pygments lexer alias (e.g. "python", "sql").

    Returns:
        HTML markup using the "codehilite" CSS class.
    """
    # Was bare `except:`, which also swallowed KeyboardInterrupt/SystemExit.
    try:
        lexer = get_lexer_by_name(language)
    except Exception:
        try:
            lexer = guess_lexer(code)
        except Exception:
            lexer = get_lexer_by_name("text")
    formatter = HtmlFormatter(style="friendly", cssclass="codehilite")
    return highlight(code, lexer, formatter)
|
| 23 |
+
|
| 24 |
+
def render_chat_to_html(chat_history) -> str:
    """Render a chat transcript as a self-contained HTML document.

    Args:
        chat_history: Iterable of (role, content) pairs. A role containing
            "You" is styled as the user; anything else as the assistant.

    Returns:
        A complete HTML page (doctype through </html>) as one string, with
        Pygments CSS inlined for code highlighting.
    """
    # Local import: replaces the old `__import__('datetime')` expression that
    # was embedded inside the f-string template.
    from datetime import datetime

    css = HtmlFormatter(style="friendly").get_style_defs('.codehilite')
    exported_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    html_lines = [f"""
<!DOCTYPE html>
<html>
<head>
<meta charset='utf-8'/>
<title>FINESE SCHOOL: Data Science Mentor Session</title>
<style>
body {{
    font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
    line-height: 1.6;
    padding: 30px;
    background: #fff;
    color: #333;
}}
h1 {{
    color: #2c3e50;
    text-align: center;
    border-bottom: 2px solid #3498db;
    padding-bottom: 10px;
}}
h2 {{
    color: #3498db;
    border-left: 4px solid #3498db;
    padding-left: 10px;
}}
.message {{
    margin-bottom: 25px;
    padding: 20px;
    border-radius: 12px;
    box-shadow: 0 2px 5px rgba(0,0,0,0.1);
}}
.user {{
    background: #e3f2fd;
    border-left: 5px solid #2196f3;
}}
.assistant {{
    background: #f5f5f5;
    border-left: 5px solid #757575;
}}
.diagnosis {{
    background: #fff8e1;
    padding: 15px;
    border-radius: 10px;
    margin: 15px 0;
    border-left: 5px solid #ffc107;
}}
.tip {{
    background: #e8f5e9;
    border-left: 5px solid #4caf50;
    padding: 15px;
    border-radius: 10px;
    margin: 15px 0;
}}
.refs {{
    background: #f3e5f5;
    border-left: 5px solid #9c27b0;
    padding: 15px;
    border-radius: 10px;
    margin: 15px 0;
}}
.on-topic-warning {{
    background: #ffebee;
    border-left: 5px solid #f44336;
    padding: 15px;
    border-radius: 10px;
    margin: 15px 0;
}}
.code-block {{
    background-color: #f8f9fa;
    border-radius: 8px;
    padding: 15px;
    overflow-x: auto;
    font-family: 'Courier New', monospace;
    font-size: 0.9em;
    margin: 15px 0;
    border: 1px solid #eee;
}}
{css}
a {{
    color: #3498db;
    text-decoration: none;
}}
a:hover {{
    text-decoration: underline;
}}
</style>
</head>
<body>
<h1>FINESE SCHOOL: Expert Data Science Session</h1>
<p><em>Session exported on {exported_at}</em></p>
<hr>
"""]

    for role, content in chat_history:
        cls = "user" if "You" in role else "assistant"
        # strip_html removes every tag from the content, so the old
        # post-processing regexes for .diagnosis/.tip/.refs/.codehilite divs
        # could never match; that dead code has been removed.
        clean_content = strip_html(content)
        html_lines.append(f'<div class="message {cls}"><h2>{role}</h2><div>{clean_content}</div></div>')

    html_lines.append("</body></html>")
    return "".join(html_lines)
|
| 149 |
+
|
| 150 |
+
def export_chat_to_pdf(chat_history) -> bytes:
    """Export a chat transcript to PDF using wkhtmltopdf via pdfkit.

    Renders the chat to HTML, writes it to a temp file, converts it to PDF
    and returns the bytes. Both temp artifacts are always cleaned up.

    Args:
        chat_history: Iterable of (role, content) pairs.

    Returns:
        The generated PDF document as raw bytes.

    Raises:
        RuntimeError: If rendering or conversion fails for any reason.
    """
    try:
        # Prefer the well-known container path for wkhtmltopdf; fall back to
        # letting pdfkit locate the binary on PATH. configuration() raises
        # OSError when the executable is missing (was a bare `except:`).
        try:
            config = pdfkit.configuration(wkhtmltopdf="/usr/bin/wkhtmltopdf")
        except OSError:
            config = None

        html_content = render_chat_to_html(chat_history)

        with tempfile.NamedTemporaryFile(mode="w", suffix=".html", delete=False, encoding="utf-8") as f:
            f.write(html_content)
            temp_html = f.name

        pdf_path = temp_html.replace(".html", ".pdf")

        options = {
            'page-size': 'A4',
            'margin-top': '0.75in',
            'margin-right': '0.75in',
            'margin-bottom': '0.75in',
            'margin-left': '0.75in',
            'encoding': "UTF-8",
            'no-outline': None,
            'enable-local-file-access': None,
            'quiet': ''
        }

        try:
            if config:
                pdfkit.from_file(temp_html, pdf_path, configuration=config, options=options)
            else:
                pdfkit.from_file(temp_html, pdf_path, options=options)

            with open(pdf_path, "rb") as f:
                return f.read()
        finally:
            # Best-effort cleanup; pdf_path may not exist if conversion failed.
            for path in [temp_html, pdf_path]:
                if os.path.exists(path):
                    os.remove(path)
    except Exception as e:
        logger.error(f"PDF export failed: {str(e)}")
        # Chain the cause so the original traceback survives.
        raise RuntimeError(f"Failed to export PDF: {str(e)}") from e
|
src/utils.py
ADDED
|
@@ -0,0 +1,209 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
import html
|
| 3 |
+
import uuid
|
| 4 |
+
import logging
|
| 5 |
+
from typing import List, Tuple, Optional
|
| 6 |
+
|
| 7 |
+
# Configure logging
|
| 8 |
+
logger = logging.getLogger(__name__)
|
| 9 |
+
|
| 10 |
+
def sanitize_input(text: str) -> str:
    """Defang user-supplied text before it is rendered or logged.

    Strips angle brackets and inline JavaScript event-handler attributes,
    then caps the result at 2000 characters. Any failure yields an empty
    string instead of raising.

    Args:
        text: Raw user input.

    Returns:
        The sanitized text (at most 2000 characters), or "" on error.
    """
    try:
        # Drop angle brackets first so no tag can survive intact, then
        # remove any on*="..." handler attributes that remain as text.
        cleaned = re.sub(r'[<>]', '', text)
        cleaned = re.sub(r'on\w+="[^"]*"', '', cleaned, flags=re.IGNORECASE)
        return cleaned[:2000]
    except Exception as e:
        logger.error(f"Error sanitizing input: {e}")
        return ""
|
| 29 |
+
|
| 30 |
+
def strip_html(text: str) -> str:
    """Convert HTML to plain text while preserving rough block structure.

    Line breaks, paragraph/div closings and list items are converted to
    newlines/bullets before all remaining tags are removed. Entities are
    deliberately left untouched because the output is re-embedded in HTML
    by the PDF exporter.

    Args:
        text: HTML content to be stripped.

    Returns:
        Plain text with tags removed and runs of blank lines collapsed.
    """
    if not text:
        return ""

    # <br>, <br/>, <br /> in any case -> newline. The old literal
    # str.replace calls missed "<br />" and uppercase variants.
    text = re.sub(r'<br\s*/?>', '\n', text, flags=re.IGNORECASE)
    # Paragraph/div closings mark block boundaries.
    text = re.sub(r'</p>', '\n\n', text, flags=re.IGNORECASE)
    text = re.sub(r'</div>', '\n\n', text, flags=re.IGNORECASE)

    # List containers become line breaks; items become "- " bullets.
    text = re.sub(r'</?ul>', '\n', text, flags=re.IGNORECASE)
    text = re.sub(r'</?ol>', '\n', text, flags=re.IGNORECASE)
    text = re.sub(r'<li>', '\n- ', text, flags=re.IGNORECASE)

    # Drop every remaining tag.
    clean_text = re.sub(r"<[^>]+>", "", text)

    # Collapse runs of blank (or whitespace-only) lines to one blank line.
    clean_text = re.sub(r'\n\s*\n', '\n\n', clean_text)
    return clean_text.strip()
|
| 59 |
+
|
| 60 |
+
def inject_interactive_elements(html_str: str) -> str:
    """Add copy buttons and syntax highlighting to fenced code blocks.

    Markdown-style ``` fences inside `html_str` are replaced with
    <pre><code> blocks carrying a per-block "Copy" button; highlight.js
    assets and a copy-to-clipboard script are appended around the result.
    Content without fences is returned untouched, as is the original input
    on any processing error.

    Args:
        html_str: HTML content that may contain ```lang fenced code.

    Returns:
        HTML with interactive elements added, or the input unchanged.
    """
    if not html_str or '```' not in html_str:
        return html_str

    # Note: `re` is imported at module level; the old redundant
    # function-local `import re` has been removed.

    def add_copy_button(match):
        # Escape the code so it renders literally; tag it for highlight.js.
        code_content = match.group(2)
        code_lang = match.group(1) if match.group(1) else "text"
        button_id = str(uuid.uuid4())[:8]

        return f'''
    <div style="position: relative; margin: 10px 0;">
        <button id="copy-btn-{button_id}" onclick="copyCode('{button_id}')"
                style="position: absolute; top: 5px; right: 5px; z-index: 10;
                       background: #f0f0f0; border: 1px solid #ccc; border-radius: 4px;
                       padding: 4px 8px; cursor: pointer; font-size: 12px;">
            Copy
        </button>
        <pre style="padding: 20px 10px 10px 10px; border-radius: 8px;
                    background: #f8f8f8; overflow-x: auto; position: relative;">
<code class="language-{code_lang}">{html.escape(code_content)}</code>
        </pre>
    </div>
    '''

    try:
        result = re.sub(r'```(\w*)\n(.*?)```', add_copy_button, html_str, flags=re.DOTALL)

        # Clipboard helper plus highlight.js initialization on page load.
        js_script = """
    <script>
    function copyCode(elementId) {
        const button = document.getElementById('copy-btn-' + elementId);
        const codeBlock = button.nextElementSibling.querySelector('code');
        const text = codeBlock.textContent;

        navigator.clipboard.writeText(text).then(() => {
            const originalText = button.textContent;
            button.textContent = 'Copied!';
            setTimeout(() => {
                button.textContent = originalText;
            }, 2000);
        }).catch(err => {
            console.error('Failed to copy: ', err);
            button.textContent = 'Failed';
            setTimeout(() => {
                button.textContent = 'Copy';
            }, 2000);
        });
    }

    // Initialize syntax highlighting
    document.addEventListener('DOMContentLoaded', (event) => {
        document.querySelectorAll('pre code').forEach((el) => {
            hljs.highlightElement(el);
        });
    });
    </script>
    """

        # highlight.js stylesheet and library from the CDN.
        css_link = '<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.7.0/styles/github.min.css">\n'
        hljs_script = '<script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.7.0/highlight.min.js"></script>\n'

        result = css_link + hljs_script + result + js_script

        return result
    except Exception as e:
        logger.error(f"Error adding interactive elements: {e}")
        return html_str
|
| 146 |
+
|
| 147 |
+
def detect_language_from_context(question: str, topic: str) -> str:
    """Infer a code-highlighting language from topic and question text.

    The topic string is consulted first, then the question; the first
    language whose keyword appears wins. "text" is the fallback when no
    known indicator is found.

    Args:
        question: User's question text.
        topic: Main topic of the query.

    Returns:
        Lower-cased language identifier for syntax highlighting.
    """
    # Languages mapped to their common textual indicators; order matters
    # because the first match wins.
    indicators = {
        "Python": ["python", "pandas", "numpy", "matplotlib", "dataframe"],
        "SQL": ["sql", "query", "database", "select", "join"],
        "JavaScript": ["javascript", "js", "react", "dom", "node"],
        "Java": ["java", "spring", "hibernate"],
        "C#": ["c#", "csharp", "dotnet", ".net"],
        "Power BI": ["dax", "powerbi", "power bi", "pbix"],
        "Data Visualization": ["visualization", "chart", "plot", "graph"],
        "HTML": ["html", "markup", "webpage"],
        "CSS": ["css", "stylesheet"],
        "Shell": ["bash", "shell", "command", "script"]
    }

    def first_match(haystack: str):
        lowered = haystack.lower()
        for language, keywords in indicators.items():
            if any(keyword.lower() in lowered for keyword in keywords):
                return language.lower()
        return None

    # Topic takes precedence over question wording.
    return first_match(topic) or first_match(question) or "text"
|
| 185 |
+
|
| 186 |
+
def truncate_text(text: str, max_length: int = 500, min_length: int = 200) -> str:
    """Shorten `text` to at most `max_length` characters plus an ellipsis.

    Prefers to break at the last space found between `min_length` and
    `max_length` so words are not cut mid-way; falls back to a hard cut
    when no such space exists. Short inputs pass through unchanged.

    Args:
        text: Text to truncate.
        max_length: Maximum length for the truncated text.
        min_length: Earliest position considered for a word-boundary break.

    Returns:
        The (possibly truncated) text, with "..." appended when shortened.
    """
    if not text:
        return ""
    if len(text) <= max_length:
        return text

    # rfind returns -1 when no space lies in [min_length, max_length).
    boundary = text.rfind(' ', min_length, max_length)
    cut = boundary if boundary > 0 else max_length
    return text[:cut] + "..."
|
tests/test_chat_engine.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# tests/test_chat_engine.py
|
| 2 |
+
import unittest
|
| 3 |
+
from src.chat_engine import build_expert_prompt
|
| 4 |
+
from src.config import TOPIC_REGISTRY
|
| 5 |
+
|
| 6 |
+
class TestChatEngine(unittest.TestCase):
    """Smoke tests for the prompt-building side of the chat engine."""

    def test_build_expert_prompt(self):
        # The system message of the built prompt must embed the tutor persona.
        spec = TOPIC_REGISTRY["Python"]
        built = build_expert_prompt(spec, "What is a decorator?")
        system_message = built.messages[0].content
        self.assertIn("Dr. Data", system_message)
|