# bfh-studadmin-assist — app_unified.py
# (commit 36f54bf: "Add @spaces.GPU decorator to fix HF Spaces runtime error")
"""Unified application with toggle between Simple and Multi-Agent modes."""
import gradio as gr
import asyncio
import logging
from pathlib import Path
from typing import Tuple
# Import spaces for HuggingFace GPU decorator
try:
import spaces
HF_SPACES = True
except ImportError:
HF_SPACES = False
# Simple mode imports
from openai import OpenAI
from src.config import get_config
from src.document_processing.loader import MarkdownDocumentLoader
from src.document_processing.chunker import SemanticChunker
from src.indexing.memory_indexer import MemoryDocumentIndexer
from src.retrieval.memory_retriever import MemoryRetriever
from src.ui.formatters import ChunkFormatter
# Multi-agent mode imports - completely lazy, only import when needed
# NOTE(review): this flag is never set to False anywhere in this file; actual
# availability is checked lazily in UnifiedAssistant._get_orchestrator.
MULTI_AGENT_AVAILABLE = True # Assume available, will check when needed

# Module-wide logging configuration and module logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Dummy GPU function to satisfy HuggingFace Spaces GPU check
if HF_SPACES:
    @spaces.GPU
    def _hf_spaces_gpu_warmup():
        """Dummy function to satisfy HF Spaces GPU decorator requirement."""
        # Registering at least one @spaces.GPU-decorated function avoids the
        # HF Spaces GPU runtime error; nothing in this file calls it.
        return "GPU ready"
class UnifiedAssistant:
    """Unified assistant supporting both simple and multi-agent modes.

    Simple mode answers with one OpenAI chat-completion call over retrieved
    context. Multi-agent mode delegates to a lazily constructed orchestrator
    (intent detection -> compose -> fact-check).
    """

    @staticmethod
    def _format_documents_html(documents) -> str:
        """Format Haystack documents as simple HTML.

        Args:
            documents: Iterable of document objects exposing ``content``,
                ``meta`` and (optionally) ``score`` attributes.

        Returns:
            HTML string with one collapsible card per document, or a
            placeholder paragraph when nothing was retrieved.
        """
        import html  # stdlib; local import keeps the module import list unchanged

        if not documents:
            return "<p>No documents retrieved.</p>"
        html_parts = []
        for i, doc in enumerate(documents, 1):
            source = doc.meta.get("source_file", "Unknown") if hasattr(doc, 'meta') and doc.meta else "Unknown"
            # Haystack documents carry score=None by default, which the
            # ``:.3f`` format below cannot handle -- normalise to 0.0.
            score = getattr(doc, 'score', None)
            if score is None:
                score = 0.0
            content = doc.content if hasattr(doc, 'content') else str(doc)
            # Truncate long content
            preview = content[:200] + "..." if len(content) > 200 else content
            # Escape document text so stray markup inside chunks cannot break
            # or inject into the rendered HTML component.
            source = html.escape(str(source))
            preview = html.escape(preview)
            content = html.escape(content)
            html_parts.append(f"""
            <div style="border: 1px solid #ddd; border-radius: 8px; padding: 16px; margin-bottom: 16px; background: #f9f9f9; color: #1f2937;">
                <div style="margin-bottom: 12px; color: #1f2937;">
                    <strong>#{i}</strong> |
                    <span style="color: #2563eb;">📄 {source}</span> |
                    <span style="color: #666;">Score: {score:.3f}</span>
                </div>
                <details>
                    <summary style="cursor: pointer; padding: 8px; background: white; border-radius: 4px; margin-bottom: 8px; color: #1f2937;">
                        <strong>Preview:</strong> {preview}
                    </summary>
                    <div style="padding: 12px; background: white; border-radius: 4px; margin-top: 8px; white-space: pre-wrap; font-size: 0.9em; color: #1f2937;">
                        {content}
                    </div>
                </details>
            </div>
            """)
        return "".join(html_parts)

    def __init__(self):
        """Build config, OpenAI client, document index and retriever.

        Construction performs I/O: it loads (or creates) the embedded
        document index. The multi-agent orchestrator is deferred until first
        use.
        """
        self.config = get_config()
        self.client = OpenAI(api_key=self.config.llm.api_key)
        # Load documents (shared by both modes)
        self.indexer = MemoryDocumentIndexer(llm_config=self.config.llm)
        self._load_or_create_documents()
        # Initialize retriever (for simple mode)
        self.retriever = MemoryRetriever(
            document_store=self.indexer.document_store,
            llm_config=self.config.llm,
            retrieval_config=self.config.retrieval,
        )
        # Initialize orchestrator (for multi-agent mode) - lazy
        self.orchestrator = None

    def _load_or_create_documents(self):
        """Load pre-embedded documents from JSON, or index from scratch.

        Tries ``data/embedded_documents.json`` first; on any failure falls
        back to loading, chunking and indexing the markdown corpus.
        """
        import json
        from haystack import Document as HaystackDoc

        json_path = Path("data/embedded_documents.json")
        if json_path.exists():
            logger.info(f"Loading embedded documents from {json_path}...")
            try:
                # Explicit encoding: the corpus is UTF-8 (German umlauts);
                # without it Windows would use the locale codec and fail.
                with open(json_path, "r", encoding="utf-8") as f:
                    docs_data = json.load(f)
                documents = []
                for doc_data in docs_data:
                    doc = HaystackDoc(
                        id=doc_data.get("id"),
                        content=doc_data["content"],
                        embedding=doc_data.get("embedding"),
                        meta=doc_data.get("meta", {})
                    )
                    documents.append(doc)
                self.indexer.document_store.write_documents(documents)
                logger.info(f"Loaded {len(documents)} documents with embeddings")
                return
            except Exception as e:
                # Deliberate best-effort: log and fall through to fresh indexing.
                logger.warning(f"Failed to load documents: {e}")
        # Create documents if not found
        logger.info("Creating fresh document index...")
        loader = MarkdownDocumentLoader(self.config.document_processing.documents_path)
        documents = loader.load_documents()
        chunker = SemanticChunker(
            chunk_size=self.config.document_processing.chunk_size,
            chunk_overlap=self.config.document_processing.chunk_overlap,
            min_chunk_size=self.config.document_processing.min_chunk_size,
        )
        chunked_docs = chunker.chunk_documents(documents)
        self.indexer.index_documents(chunked_docs)

    def _get_orchestrator(self):
        """Lazy load orchestrator for multi-agent mode.

        Raises:
            RuntimeError: if the multi-agent dependencies cannot be imported.
        """
        if self.orchestrator is None:
            try:
                from src.pipeline.memory_orchestrator import MemoryRAGOrchestrator
                self.orchestrator = MemoryRAGOrchestrator(
                    config=self.config,
                    document_indexer=self.indexer  # Correct parameter name
                )
            except ImportError as e:
                # Chain the cause so the original ImportError stays visible.
                raise RuntimeError(f"Multi-agent mode is not available: {e}") from e
        return self.orchestrator

    def process_query_simple(self, query: str) -> Tuple[str, str, str]:
        """Process query with simple single-LLM mode (fast).

        Args:
            query: The student's question (any language).

        Returns:
            Tuple of (email text, retrieved-chunks HTML, metadata markdown).
        """
        logger.info(f"[SIMPLE MODE] Processing query: {query}")
        # Retrieve documents
        retrieved_docs = self.retriever.retrieve(query)
        logger.info(f"Retrieved {len(retrieved_docs)} documents")
        # Build context -- GPT-5 gets a tighter budget because reasoning
        # tokens compete with output tokens.
        max_docs = 2 if "gpt-5" in self.config.llm.model_name else 3
        max_chars_per_doc = 800 if "gpt-5" in self.config.llm.model_name else 1500
        context_parts = []
        for i, doc in enumerate(retrieved_docs[:max_docs], 1):
            source = doc.meta.get("source_file", "Unknown")
            content = doc.content[:max_chars_per_doc]
            context_parts.append(f"[Dokument {i}: {source}]\n{content}\n")
        context = "\n".join(context_parts) if context_parts else "Keine relevanten Dokumente gefunden."
        # Generate email with single LLM call
        system_prompt = """Du bist ein hilfreicher Assistent für die Studienadministration der BFH.
Deine Aufgabe ist es, professionelle E-Mail-Antworten auf Studentenanfragen zu verfassen.
Richtlinien:
- Antworte in der gleichen Sprache wie die Anfrage (Deutsch, Englisch oder Französisch)
- Verwende einen professionellen aber freundlichen Ton
- Sei klar, präzise und hilfreich
- Beziehe dich auf konkrete Formulare, Fristen oder Verfahren wenn relevant
- Gib klare nächste Schritte an
- Wenn Informationen fehlen, sage dies ehrlich
Für deutsche E-Mails:
- Verwende die formelle "Sie"-Form
- Grußformel: "Guten Tag" oder "Sehr geehrte/r..."
- Schlussformel: "Freundliche Grüsse" oder "Mit freundlichen Grüssen"
"""
        user_prompt = f"""Beantworte die folgende Anfrage basierend auf den verfügbaren Informationen:
Anfrage: {query}
Verfügbare Informationen:
{context}
Verfasse eine vollständige professionelle E-Mail-Antwort."""
        try:
            # GPT-5 models have different parameter requirements
            completion_params = {
                "model": self.config.llm.model_name,
                "messages": [
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ],
            }
            # GPT-5 uses max_completion_tokens and supports reasoning_effort parameter
            if "gpt-5" in self.config.llm.model_name:
                completion_params["max_completion_tokens"] = self.config.llm.max_tokens
                # Don't set temperature for GPT-5 (only supports default 1.0)
                # Use minimal reasoning effort to get actual output instead of all reasoning tokens
                completion_params["reasoning_effort"] = "minimal"
            else:
                completion_params["max_tokens"] = self.config.llm.max_tokens
                completion_params["temperature"] = self.config.llm.temperature
            response = self.client.chat.completions.create(**completion_params)
            logger.info(f"[DEBUG] Response object: {response}")
            logger.info(f"[DEBUG] Response.choices: {response.choices}")
            email = response.choices[0].message.content
            if email is None or email.strip() == "":
                logger.error("LLM returned null or empty response!")
                logger.error(f"[DEBUG] Full response: {response.model_dump()}")
                email = "Error: The model returned an empty response. Please try again."
        except Exception as e:
            logger.error(f"Error generating email: {e}")
            email = f"Error generating response: {str(e)}"
        # Format chunks for display
        chunks_html = self._format_documents_html(retrieved_docs)
        # Create metadata
        metadata = f"""**Mode**: Simple (Single LLM call)
**Model**: {self.config.llm.model_name}
**Documents Retrieved**: {len(retrieved_docs)}
**Documents Used**: {min(len(retrieved_docs), max_docs)}
"""
        return email, chunks_html, metadata

    async def process_query_multi_agent(self, query: str) -> Tuple[str, str, str]:
        """Process query with multi-agent mode (high quality, async parallel).

        Args:
            query: The student's question (any language).

        Returns:
            Tuple of (email text, retrieved-chunks HTML, metadata markdown).

        Raises:
            RuntimeError: if multi-agent dependencies are unavailable.
        """
        logger.info(f"[MULTI-AGENT MODE] Processing query: {query}")
        orchestrator = self._get_orchestrator()
        result = await orchestrator.process_query(query)
        # Format email
        email = f"""Subject: {result.email_draft.subject}
{result.email_draft.body}"""
        # Format chunks - result.retrieved_docs are dicts, need to convert back
        from haystack import Document as HaystackDoc
        docs = []
        for doc_dict in result.retrieved_docs:
            if isinstance(doc_dict, dict):
                doc = HaystackDoc(
                    content=doc_dict.get('content', ''),
                    meta=doc_dict.get('meta', {}),
                    id=doc_dict.get('id')
                )
                if 'score' in doc_dict:
                    doc.score = doc_dict['score']
                docs.append(doc)
        chunks_html = self._format_documents_html(docs)
        # Create metadata
        mode_type = "Parallel ⚡" if self.config.use_parallel_processing else "Sequential 🐌"
        metadata = f"""**Mode**: Multi-Agent ({mode_type})
**Model**: {self.config.llm.model_name}
**Processing Time**: {result.processing_time:.1f}s
**Documents Retrieved**: {len(result.retrieved_docs)}
**Intent Detected**:
- Action: {result.intent.action_type}
- Topic: {result.intent.topic}
- Language: {result.intent.language}
**Fact Check**:
- Accuracy: {result.fact_check.accuracy_score:.0%}
- Status: {'✓ Accurate' if result.fact_check.is_accurate else '⚠ Issues Found'}
- Issues: {len(result.fact_check.issues_found)}
"""
        if result.fact_check.issues_found:
            metadata += "\n**Issues**:\n"
            for issue in result.fact_check.issues_found[:3]:  # Show first 3
                metadata += f"- {issue}\n"
        return email, chunks_html, metadata

    def process_query(self, query: str, mode: str) -> Tuple[str, str, str]:
        """Process query with selected mode.

        Gradio entry point: dispatches to simple or multi-agent processing
        and converts any failure into an error message instead of raising.
        """
        if not query or not query.strip():
            return "Please enter a query.", "", ""
        try:
            if mode == "Simple (Fast)":
                return self.process_query_simple(query)
            else:  # Multi-Agent
                # Run async function (will check for dependencies inside)
                return asyncio.run(self.process_query_multi_agent(query))
        except Exception as e:
            # Top-level UI boundary: log with traceback, show the message.
            logger.error(f"Error processing query: {e}", exc_info=True)
            return f"Error: {str(e)}", "", ""
# Initialize assistant
logger.info("Initializing Unified Assistant...")
# Module-level singleton: built once at import time so the Gradio callback can
# reuse the loaded index and OpenAI client. Construction performs I/O
# (document loading / indexing), so importing this module is slow.
assistant = UnifiedAssistant()
logger.info("Assistant ready!")

# Example queries
# Sample questions shown as clickable examples in the UI (mixed German and
# English, matching the bilingual student-administration domain).
EXAMPLE_QUERIES = [
    "Wie kann ich mich exmatrikulieren?",
    "What are the deadlines for leave of absence?",
    "Wie ändere ich meinen Namen in den Studiendokumenten?",
    "Welche Versicherungen brauche ich als Student?",
]
# Create Gradio interface
# Layout: left column holds mode selector + query input + examples; right
# column shows the generated email, processing metadata, and a collapsible
# accordion with the retrieved source chunks.
with gr.Blocks(title="BFH Student Administration Assistant") as demo:
    gr.Markdown("# 🎓 BFH Student Administration Email Assistant")
    gr.Markdown("""
Ask questions about BFH student administration and receive professional email responses.
**Modes**:
- **Simple (Fast)**: Single LLM call (~5-10s) - Best for quick responses
- **Multi-Agent (Quality)**: Intent + Compose + Fact-Check (~60-75s) - Best for accuracy
""")
    with gr.Row():
        with gr.Column(scale=2):
            # Radio value must match the string checked in
            # UnifiedAssistant.process_query ("Simple (Fast)").
            mode_radio = gr.Radio(
                choices=["Simple (Fast)", "Multi-Agent (Quality)"],
                value="Simple (Fast)",
                label="Processing Mode",
                info="Simple mode is faster, Multi-Agent provides higher quality"
            )
            query_input = gr.Textbox(
                label="Your Question",
                placeholder="e.g., Wie kann ich mich exmatrikulieren?",
                lines=3
            )
            submit_btn = gr.Button("Generate Email Response", variant="primary")
            # Clicking an example fills query_input only; it does not submit.
            gr.Examples(
                examples=EXAMPLE_QUERIES,
                inputs=query_input,
                label="Example Questions"
            )
        with gr.Column(scale=3):
            email_output = gr.Textbox(
                label="Generated Email",
                lines=15,
                show_copy_button=True
            )
            metadata_output = gr.Markdown(label="Processing Info")
            # Collapsed by default; populated with the HTML built by
            # UnifiedAssistant._format_documents_html.
            with gr.Accordion("Retrieved Source Documents", open=False):
                chunks_output = gr.HTML(label="Source Chunks")
    # Wire the button to the synchronous dispatcher; outputs map 1:1 to the
    # (email, chunks_html, metadata) tuple it returns.
    submit_btn.click(
        fn=assistant.process_query,
        inputs=[query_input, mode_radio],
        outputs=[email_output, chunks_output, metadata_output]
    )

if __name__ == "__main__":
    demo.launch()