Nihal2000 commited on
Commit
2c1203c
·
1 Parent(s): b54196e
.dockerignore ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python cache
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+
8
+ # Virtual environments
9
+ venv/
10
+ env/
11
+ ENV/
12
+ .venv
13
+
14
+ # IDE
15
+ .vscode/
16
+ .idea/
17
+ *.swp
18
+ *.swo
19
+ *~
20
+
21
+ # Git
22
+ .git/
23
+ .gitignore
24
+ .gitattributes
25
+
26
+ # CI/CD
27
+ .github/
28
+ .gitlab-ci.yml
29
+
30
+ # Documentation
31
+ README.md
32
+ docs/
33
+ *.md
34
+ !requirements.txt
35
+
36
+ # Test files
37
+ test_*.py
38
+ *_test.py
39
+ tests/
40
+ .pytest_cache/
41
+
42
+ # Large data files (these should be in volumes)
43
+ data/
44
+ vector_store/
45
+ documents/
46
+ podcasts/
47
+ *.db
48
+ *.sqlite
49
+
50
+ # Logs
51
+ *.log
52
+ logs/
53
+
54
+ # OS files
55
+ .DS_Store
56
+ Thumbs.db
57
+
58
+ # Deployment files (not needed in container)
59
+ deploy_from_env.py
60
+ modal_deploy.py
61
+ blaxel.yaml
62
+ bl.cmd
63
+ test_persistence.py
64
+
65
+ # Environment files
66
+ .env
67
+ .env.*
CLIENT_SETUP.md ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 🔌 Connect to Claude Desktop
2
+
3
+ You can use the **AI Digital Library Assistant** as a tool provider for Claude Desktop! This allows you to chat with Claude and have it directly access your library, search documents, and even trigger podcast generation.
4
+
5
+ ## Prerequisites
6
+
7
+ - [Claude Desktop App](https://claude.ai/download) installed.
8
+ - The **AI Digital Library Assistant** running (either locally or on Hugging Face Spaces).
9
+
10
+ ## Configuration
11
+
12
+ 1. Open your Claude Desktop configuration file:
13
+ - **Windows**: `%APPDATA%\Claude\claude_desktop_config.json`
14
+ - **Mac**: `~/Library/Application Support/Claude/claude_desktop_config.json`
15
+
16
+ 2. Add the following configuration:
17
+
18
+ ```json
19
+ {
20
+ "mcpServers": {
21
+ "ai-library": {
22
+ "command": "npx",
23
+ "args": [
24
+ "-y",
25
+ "mcp-remote",
26
+ "https://mcp-1st-birthday-ai-digital-library-assistant.hf.space/gradio_api/mcp/sse"
27
+ ]
28
+ }
29
+ }
30
+ }
31
+
32
+ ```
33
+
34
+ > If running locally, you can use the local MCP SSE endpoint (usually `http://localhost:7860/gradio_api/mcp/sse`).
35
+
36
+ 3. **Restart Claude Desktop**.
37
+
38
+ ## What You Can Do
39
+
40
+ Once connected, you can ask Claude things like:
41
+
42
+ - "Search my library for documents."
43
+ - "Summarize the last PDF I uploaded."
44
+ - "Create a podcast script from these search results."
45
+ - "Generate tags for this document ID."
46
+
47
+
QUICKSTART.md ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 🚀 Quickstart Guide
2
+
3
+ Get your **AI Digital Library Assistant** up and running in minutes!
4
+
5
+ ## Prerequisites
6
+
7
+ - **Python 3.10+** installed on your system.
8
+ - **OpenAI API Key** (for RAG and summarization).
9
+ - **ElevenLabs API Key** (optional, for Voice & Podcast features).
10
+
11
+ ## 🛠️ Installation
12
+
13
+ 1. **Clone the repository:**
14
+ ```bash
15
+ git clone https://huggingface.co/spaces/YOUR_USERNAME/AiDigitalLibraryAssistant
16
+ cd AiDigitalLibraryAssistant
17
+ ```
18
+
19
+ 2. **Create a virtual environment (recommended):**
20
+ ```bash
21
+ python -m venv venv
22
+ # Windows
23
+ venv\Scripts\activate
24
+ # Mac/Linux
25
+ source venv/bin/activate
26
+ ```
27
+
28
+ 3. **Install dependencies:**
29
+ ```bash
30
+ pip install -r requirements.txt
31
+ ```
32
+
33
+ ## ⚙️ Configuration
34
+
35
+ 1. Create a `.env` file in the root directory:
36
+ ```bash
37
+ touch .env
38
+ ```
39
+
40
+ 2. Add your API keys to `.env`:
41
+ ```env
42
+ # Required for core features
43
+ OPENAI_API_KEY=sk-your_openai_key_here
44
+
45
+ # Required for Voice & Podcast features
46
+ ELEVENLABS_API_KEY=your_elevenlabs_key_here
47
+ ELEVENLABS_AGENT_ID=your_agent_id_here
48
+ ```
49
+
50
+ ## 🏃‍♂️ Running the App
51
+
52
+ 1. **Start the application:**
53
+ ```bash
54
+ python app.py
55
+ ```
56
+
57
+ 2. **Open your browser:**
58
+ The app will launch automatically at `http://localhost:7860`.
59
+
60
+ ## 💡 Usage Tips
61
+
62
+ - **Upload Documents:** Go to the "Library" tab to upload PDF, TXT, or DOCX files.
63
+ - **Chat:** Use the "Chat" tab to ask questions about your documents.
64
+ - **Voice Mode:** Click "Start Conversation" to talk to your library!
65
+ - **Podcast Studio:** Select documents and generate a custom podcast episode.
66
+
67
+ ---
68
+ *Built for the MCP-1st-Birthday Hackathon* 🎈
README.md CHANGED
@@ -1,12 +1,256 @@
1
  ---
2
- title: AI Digital Library Assistant Agent
3
- emoji: 🐢
4
- colorFrom: yellow
5
- colorTo: yellow
6
  sdk: gradio
7
- sdk_version: 6.2.0
8
  app_file: app.py
9
  pinned: false
 
 
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: AI Digital Library Assistant
3
+ emoji: 📚
4
+ colorFrom: blue
5
+ colorTo: indigo
6
  sdk: gradio
7
+ sdk_version: "5.38.0"
8
  app_file: app.py
9
  pinned: false
10
+ license: mit
11
+
12
  ---
13
 
14
+
15
+ The **AI Digital Library Assistant** is a next-generation knowledge management tool built for the **MCP 1st Birthday Hackathon**. It transforms your static document collection into an interactive, living library.
16
+
17
+ Unlike traditional RAG (Retrieval Augmented Generation) apps, this project leverages the **Model Context Protocol (MCP)** to create a modular ecosystem of tools—Ingestion, Search, and Podcast Generation—that work harmoniously to help you consume information in the way that suits *you* best.
18
+
19
+ ```mermaid
20
+ graph TD
21
+ User((👤 User))
22
+
23
+ subgraph "Frontend (Gradio)"
24
+ UI[Web Interface]
25
+ PodcastUI[Podcast Studio]
26
+ end
27
+
28
+ subgraph "MCP Server Layer"
29
+ MCPServer[Content Organizer MCP Server]
30
+
31
+ subgraph "MCP Tools"
32
+ IngestTool[📥 Ingestion Tool]
33
+ SearchTool[🔍 Search Tool]
34
+ GenTool[✨ Generative Tool]
35
+ PodTool[🎧 Podcast Tool]
36
+ end
37
+ end
38
+
39
+ subgraph "Service Layer"
40
+ VecStore[(Vector Store)]
41
+ DocStore[(Document Store)]
42
+ LLM["LLM Service (OpenAI / Nebius AI)"]
43
+ ElevenLabs[ElevenLabs API]
44
+ LlamaIndex[LlamaIndex Agent]
45
+ end
46
+
47
+ User <--> UI
48
+ UI <--> MCPServer
49
+
50
+ MCPServer --> IngestTool
51
+ MCPServer --> SearchTool
52
+ MCPServer --> GenTool
53
+ MCPServer --> PodTool
54
+
55
+ IngestTool --> VecStore
56
+ IngestTool --> DocStore
57
+ SearchTool --> VecStore
58
+ GenTool --> LLM
59
+ PodTool --> LlamaIndex
60
+ PodTool --> ElevenLabs
61
+ PodTool --> LLM
62
+ ```
63
+
64
+ ![AI LIB](https://cdn-uploads.huggingface.co/production/uploads/66f1712d906c08084995f808/TSJexR45eNpUjHhbHDOag.png)
65
+
66
+
67
+ ## 🚀 Quick Start
68
+
69
+ Check out [QUICKSTART.md](QUICKSTART.md) for detailed local setup instructions.
70
+
71
+ 1. **Clone & Install**:
72
+ ```bash
73
+ git clone https://huggingface.co/spaces/Nihal2000/AiDigitalLibraryAssistant
74
+ pip install -r requirements.txt
75
+ ```
76
+ 2. **Configure**: Add your `OPENAI_API_KEY` and `ELEVENLABS_API_KEY` to `.env`.
77
+ 3. **Run**: `python app.py`
78
+
79
+ ## 💡 How It Works
80
+
81
+ ### 1. The MCP Core
82
+ At the heart of the application is the `AiDigitalLibraryAssistant`. It exposes atomic capabilities (Tools) that the frontend consumes. This means the same tools powering this UI could be connected to Claude Desktop or any other MCP client!
83
+
84
+ ```json
85
+ {
86
+ "mcpServers": {
87
+ "ai-library": {
88
+ "command": "npx",
89
+ "args": [
90
+ "-y",
91
+ "mcp-remote",
92
+ "https://mcp-1st-birthday-ai-digital-library-assistant.hf.space/gradio_api/mcp/sse"
93
+ ]
94
+ }
95
+ }
96
+ }
97
+ ```
98
+
99
+ ### 2. 🎧 Podcast Studio (Star Feature)
100
+ Turn your reading list into a playlist! The **Podcast Studio** is a flagship feature that transforms any selection of documents into an engaging, multi-speaker audio podcast.
101
+ - **Intelligent Scripting**: Uses **LlamaIndex** and **OpenAI/Nebius AI** to analyze your documents and generate a natural, conversational script.
102
+ - **Multi-Speaker Synthesis**: Leverages **ElevenLabs** to bring the script to life with distinct, realistic voices for each host.
103
+ - **Customizable**: Choose your style (Educational, Casual, Teaching) and duration.
104
+
105
+
106
+ ### ✨ Features
107
+ ## 📚 Document Management
108
+
109
+ Multi-format Support: PDF, DOCX, TXT, and image files (PNG, JPG, JPEG)
110
+ Intelligent OCR: Automatic text extraction from images and scanned documents
111
+ Semantic Chunking: Documents automatically split into meaningful segments for better retrieval
112
+ Metadata Tracking: Comprehensive document metadata including file size, type, creation date, and custom tags
113
+ Vector Embeddings: All documents indexed with dense vector embeddings for semantic search
114
+
115
+ ## 🔍 Advanced Search
116
+
117
+ Semantic Search: Find documents by meaning, not just keywords
118
+ Configurable Results: Adjust the number of results (1-20) based on your needs
119
+ Relevance Scoring: Each result includes a confidence score
120
+ Source Attribution: Direct links to source documents with highlighted excerpts
121
+
122
+ ## 🎨 Content Studio
123
+ Transform your documents with 8 powerful AI tools:
124
+
125
+ Summarize: Generate concise, detailed, bullet-point, or executive summaries
126
+ Generate Outline: Create structured outlines from topics or documents (3-10 sections)
127
+ Explain Concept: Get explanations tailored to different audiences (general, technical, beginner, expert)
128
+ Paraphrase: Rewrite text in various styles (formal, casual, academic, simple, technical)
129
+ Categorize: Automatically classify content into user-defined categories
130
+ Key Insights: Extract the most important points from any document
131
+ Generate Questions: Create comprehension, analysis, application, creative, or factual questions
132
+ Extract Key Info: Pull out structured information (entities, dates, facts) in JSON format
133
+
134
+ ## 🏷️ Smart Tagging
135
+
136
+ AI-Generated Tags: Automatically generate 3-15 relevant tags for any document
137
+ Persistent Storage: Tags saved directly to document metadata
138
+ Batch Processing: Tag multiple documents or custom text snippets
139
+
140
+ ## ❓ RAG-Powered Q&A
141
+
142
+ Context-Aware Answers: Ask questions and get answers grounded in your documents
143
+ Source Citations: Every answer includes relevant source excerpts
144
+ Confidence Scoring: Transparency about answer reliability
145
+ Multi-Document Synthesis: Answers can draw from multiple documents simultaneously
146
+
147
+ ## 🎙️ Podcast Studio
148
+ Convert documents into engaging audio conversations:
149
+
150
+ AI Voice Generation: Ultra-realistic voices powered by ElevenLabs
151
+ Two-Host Format: Dynamic dialogue between two AI personalities
152
+ Multiple Styles: Conversational, educational, technical, or casual
153
+ Custom Duration: 5-30 minute podcasts
154
+ Voice Selection: Choose from 7+ professional AI voices
155
+ Full Transcripts: Complete text transcripts for every generated podcast
156
+ Podcast Library: Browse, play, and manage all generated podcasts
157
+
158
+ ## 📊 Dashboard & Analytics
159
+
160
+ Real-time Stats: Track total documents, vector chunks, and storage usage
161
+ Recent Activity: View recently added documents at a glance
162
+ System Health: Monitor vector store, LLM service, and voice service status
163
+
164
+ ## Data Flow
165
+
166
+ ## Document Ingestion:
167
+ - Files → OCR → Text Extraction → Chunking → Embedding Generation → Vector Store
168
+ ## Semantic Search:
169
+ - Query → Embedding → Vector Search → Relevance Ranking → Results
170
+ ## RAG Q&A:
171
+ - Question → Search → Context Retrieval → LLM Generation → Answer + Sources
172
+ ## Podcast Generation:
173
+ - Documents → Content Analysis → Script Generation → Voice Synthesis → Audio File
174
+
175
+ ### Basic Workflow
176
+ 1. Upload Documents
177
+ Navigate to the "📄 Upload Documents" tab:
178
+
179
+ Click "Select a document" or drag-and-drop files
180
+ Supported formats: PDF, DOCX, TXT, PNG, JPG, JPEG
181
+ Click "🚀 Process & Add to Library"
182
+ Wait for processing to complete (OCR runs automatically for images)
183
+ Note the Document ID from the output
184
+
185
+ 2. Search Your Library
186
+ Go to "🔍 Search Documents":
187
+
188
+ Enter a natural language query (e.g., "What are the key findings about climate change?")
189
+ Adjust "Number of Results" slider (1-20)
190
+ Click "🔍 Search"
191
+ Review results with relevance scores and source excerpts
192
+
193
+ 3. Ask Questions
194
+ Navigate to "❓ Ask Questions":
195
+
196
+ Type your question about uploaded documents
197
+ Click "❓ Get Answer"
198
+ Receive AI-generated answer with source citations
199
+ Check confidence level and review source documents
200
+
201
+ 4. Generate Content
202
+ Open "📝 Content Studio":
203
+
204
+ Select a document from dropdown OR paste custom text
205
+ Choose a task from the dropdown:
206
+
207
+ Summarize, Outline, Explain, Paraphrase, etc.
208
+
209
+
210
+ Configure task-specific options in "⚙️ Advanced Options"
211
+ Click "🚀 Run Task"
212
+ Copy or download the generated content
213
+
214
+ 5. Create Podcasts
215
+ Visit "🎧 Podcast Studio":
216
+
217
+ Select 1-5 documents using checkboxes
218
+ Choose Style (conversational, educational, technical, casual)
219
+ Set Duration (5-30 minutes)
220
+ Select voices for Host 1 and Host 2
221
+ Click "🎙️ Generate Podcast"
222
+ Listen to the generated audio and read the transcript
223
+ Browse past podcasts in the Podcast Library
224
+
225
+ 6. Generate Tags
226
+ Go to "🏷️ Generate Tags":
227
+
228
+ Select a document OR paste custom text
229
+ Adjust "Number of Tags" slider (3-15)
230
+ Click "🏷️ Generate Tags"
231
+
232
+ ## 🏆 Hackathon Tracks
233
+
234
+ We are submitting to:
235
+ - **Building MCP**: For our custom `AiDigitalLibraryAssistant` MCP server implementation.
236
+ - **MCP in Action (Consumer/Creative)**: For the innovative Podcast interface that makes personal knowledge management accessible and fun.
237
+
238
+ ## 📜 License
239
+
240
+ MIT License. Built with ❤️ for the AI community.
241
+
242
+ ## 🙏 Acknowledgements & Sponsors
243
+
244
+ This project was built for the **MCP 1st Birthday Hackathon** and proudly leverages technology from:
245
+
246
+ - **[OpenAI](https://openai.com)**: Providing the foundational intelligence for our document analysis and content generation.
247
+ - **[Nebius AI](https://nebius.com)**: Powering our high-performance inference needs.
248
+ - **[LlamaIndex](https://www.llamaindex.ai)**: The backbone of our data orchestration, enabling sophisticated RAG and agentic workflows for the Podcast Studio.
249
+ - **[ElevenLabs](https://elevenlabs.io)**: Bringing our podcasts to life with industry-leading, hyper-realistic text-to-speech.
250
+ - **[Hugging Face](https://huggingface.co)**: Hosting our application on **Spaces** and providing the **Gradio** framework for our beautiful, responsive UI.
251
+ - **[Anthropic](https://anthropic.com)**: For pioneering the **Model Context Protocol (MCP)** that makes this modular architecture possible.
252
+
253
+ ## 🔌 Connect to Claude
254
+
255
+ Want to use these tools directly inside Claude Desktop?
256
+ Check out our [Client Setup Guide](CLIENT_SETUP.md) to connect this MCP server to your local Claude instance!
app.py ADDED
@@ -0,0 +1,1854 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import os
3
+ import asyncio
4
+ import json
5
+ import logging
6
+ import tempfile
7
+ import uuid
8
+ import requests
9
+ from PIL import Image
10
+ from io import BytesIO
11
+ from datetime import datetime
12
+ from pathlib import Path
13
+ from typing import List, Dict, Any, Optional
14
+ import nest_asyncio
15
+
16
+ # Apply nest_asyncio to handle nested event loops in Gradio
17
+ nest_asyncio.apply()
18
+
19
+ # Import our custom modules
20
+ from mcp_tools.ingestion_tool import IngestionTool
21
+ from mcp_tools.search_tool import SearchTool
22
+ from mcp_tools.generative_tool import GenerativeTool
23
+ from mcp_tools.voice_tool import VoiceTool
24
+ from mcp_tools.podcast_tool import PodcastTool
25
+ from services.vector_store_service import VectorStoreService
26
+ from services.document_store_service import DocumentStoreService
27
+ from services.embedding_service import EmbeddingService
28
+ from services.llm_service import LLMService
29
+ from services.ocr_service import OCRService
30
+ from services.llamaindex_service import LlamaIndexService
31
+ from services.elevenlabs_service import ElevenLabsService
32
+ from services.podcast_generator_service import PodcastGeneratorService
33
+ from core.models import SearchResult, Document
34
+ import config
35
+ from mcp_server import mcp as fast_mcp
36
+
37
+ # Setup logging
38
+ logging.basicConfig(level=logging.INFO)
39
+ logger = logging.getLogger(__name__)
40
+
41
class ContentOrganizerMCPServer:
    """Central facade for the AI Digital Library Assistant.

    Wires together the storage, embedding, LLM, OCR, voice and podcast
    services and exposes them as coroutine "tools" consumed by the Gradio
    UI / MCP layer.  Every public ``*_async`` / ``*_sync`` method returns a
    plain ``{"success": bool, ...}`` dict instead of raising, so callers
    never need their own exception handling.
    """

    def __init__(self):
        # Initialize services
        logger.info("Initializing Content Organizer MCP Server...")
        self.vector_store = VectorStoreService()
        self.document_store = DocumentStoreService()
        self.embedding_service = EmbeddingService()
        self.llm_service = LLMService()
        self.ocr_service = OCRService()
        self.llamaindex_service = LlamaIndexService(self.document_store)

        # Initialize ElevenLabs voice service
        self.elevenlabs_service = ElevenLabsService(self.llamaindex_service)

        # Initialize Podcast Generator
        self.podcast_generator = PodcastGeneratorService(
            llamaindex_service=self.llamaindex_service,
            llm_service=self.llm_service
        )

        # Initialize tools
        self.ingestion_tool = IngestionTool(
            vector_store=self.vector_store,
            document_store=self.document_store,
            embedding_service=self.embedding_service,
            ocr_service=self.ocr_service
        )
        self.search_tool = SearchTool(
            vector_store=self.vector_store,
            embedding_service=self.embedding_service,
            document_store=self.document_store
        )
        self.generative_tool = GenerativeTool(
            llm_service=self.llm_service,
            search_tool=self.search_tool
        )
        self.voice_tool = VoiceTool(self.elevenlabs_service)
        self.podcast_tool = PodcastTool(self.podcast_generator)

        # Track processing status, keyed by task id
        self.processing_status = {}

        # Document cache for quick access (document_id -> Document)
        self.document_cache = {}
        logger.info("Content Organizer MCP Server initialized successfully!")

    def run_async(self, coro):
        """Run *coro* to completion from synchronous (Gradio callback) code.

        If the current thread already has a running event loop we cannot
        re-enter it, so the coroutine is executed via ``asyncio.run`` on a
        throwaway worker thread instead.
        """
        try:
            loop = asyncio.get_event_loop()
        except RuntimeError:
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
        if loop.is_running():
            # If loop is already running, run the coroutine on a fresh
            # thread (each thread gets its own event loop via asyncio.run).
            import concurrent.futures
            with concurrent.futures.ThreadPoolExecutor() as executor:
                future = executor.submit(asyncio.run, coro)
                return future.result()
        else:
            return loop.run_until_complete(coro)

    async def ingest_document_async(self, file_path: str, file_type: str) -> Dict[str, Any]:
        """MCP Tool: Ingest and process a document.

        Creates a task id for progress tracking, delegates to the ingestion
        tool, then pre-warms the document cache on success.  Returns the
        ingestion tool's result dict (``success``/``document_id``/``error``).
        """
        try:
            task_id = str(uuid.uuid4())
            self.processing_status[task_id] = {"status": "processing", "progress": 0}
            result = await self.ingestion_tool.process_document(file_path, file_type, task_id)
            if result.get("success"):
                self.processing_status[task_id] = {"status": "completed", "progress": 100}
                doc_id = result.get("document_id")
                if doc_id:
                    doc = await self.document_store.get_document(doc_id)
                    if doc:
                        self.document_cache[doc_id] = doc
                return result
            else:
                self.processing_status[task_id] = {"status": "failed", "error": result.get("error")}
                return result
        except Exception as e:
            logger.error(f"Document ingestion failed: {str(e)}")
            return {"success": False, "error": str(e), "message": "Failed to process document"}

    async def get_document_content_async(self, document_id: str) -> Optional[str]:
        """Get document content by ID, or ``None`` if missing/errored.

        Serves from the in-memory cache when possible and populates it on a
        cache miss.
        """
        try:
            # Check cache first
            if document_id in self.document_cache:
                return self.document_cache[document_id].content

            # Get from store
            doc = await self.document_store.get_document(document_id)
            if doc:
                self.document_cache[document_id] = doc
                return doc.content
            return None
        except Exception as e:
            logger.error(f"Error getting document content: {str(e)}")
            return None

    async def semantic_search_async(self, query: str, top_k: int = 5, filters: Optional[Dict] = None) -> Dict[str, Any]:
        """MCP Tool: Perform semantic search over the document library."""
        try:
            results = await self.search_tool.search(query, top_k, filters)
            return {"success": True, "query": query, "results": [result.to_dict() for result in results], "total_results": len(results)}
        except Exception as e:
            logger.error(f"Semantic search failed: {str(e)}")
            return {"success": False, "error": str(e), "query": query, "results": []}

    async def summarize_content_async(self, content: str = None, document_id: str = None, style: str = "concise") -> Dict[str, Any]:
        """MCP Tool: Summarize raw text or a stored document.

        If *document_id* is given (and not the sentinel ``"none"``) it takes
        precedence over *content*.  Input is truncated to 4000 characters
        before summarization.
        """
        try:
            if document_id and document_id != "none":
                content = await self.get_document_content_async(document_id)
                if not content:
                    return {"success": False, "error": f"Document {document_id} not found"}
            if not content or not content.strip():
                return {"success": False, "error": "No content provided for summarization"}
            # FIX: capture the true input length before truncation; the
            # previous code reported the truncated length instead.
            original_length = len(content)
            max_content_length = 4000
            if len(content) > max_content_length:
                content = content[:max_content_length] + "..."
            summary = await self.generative_tool.summarize(content, style)
            return {"success": True, "summary": summary, "original_length": original_length, "summary_length": len(summary), "style": style, "document_id": document_id}
        except Exception as e:
            logger.error(f"Summarization failed: {str(e)}")
            return {"success": False, "error": str(e)}

    async def generate_tags_async(self, content: str = None, document_id: str = None, max_tags: int = 5) -> Dict[str, Any]:
        """MCP Tool: Generate tags for content.

        When a real *document_id* is supplied, generated tags are also
        persisted to that document's metadata.
        """
        try:
            if document_id and document_id != "none":
                content = await self.get_document_content_async(document_id)
                if not content:
                    return {"success": False, "error": f"Document {document_id} not found"}
            if not content or not content.strip():
                # FIX: the key was '" success"' (leading space), so callers
                # checking result["success"] would not see the failure.
                return {"success": False, "error": "No content provided for tag generation"}
            tags = await self.generative_tool.generate_tags(content, max_tags)
            if document_id and document_id != "none" and tags:
                await self.document_store.update_document_metadata(document_id, {"tags": tags})
            return {"success": True, "tags": tags, "content_length": len(content), "document_id": document_id}
        except Exception as e:
            logger.error(f"Tag generation failed: {str(e)}")
            return {"success": False, "error": str(e)}

    async def generate_podcast_async(
        self,
        document_ids: List[str],
        style: str = "conversational",
        duration_minutes: int = 10,
        host1_voice: str = "Rachel",
        host2_voice: str = "Adam"
    ) -> Dict[str, Any]:
        """Generate podcast (script + audio) from the given documents."""
        try:
            result = await self.podcast_tool.generate_podcast(
                document_ids=document_ids,
                style=style,
                duration_minutes=duration_minutes,
                host1_voice=host1_voice,
                host2_voice=host2_voice
            )
            return result
        except Exception as e:
            logger.error(f"Podcast generation failed: {str(e)}")
            return {"success": False, "error": str(e)}

    async def generate_podcast_transcript_async(
        self,
        document_ids: List[str],
        style: str = "conversational",
        duration_minutes: int = 10
    ) -> Dict[str, Any]:
        """Generate podcast transcript without audio."""
        try:
            return await self.podcast_tool.generate_transcript(
                document_ids=document_ids,
                style=style,
                duration_minutes=duration_minutes
            )
        except Exception as e:
            logger.error(f"Transcript generation failed: {str(e)}")
            return {"success": False, "error": str(e)}

    def list_podcasts_sync(self, limit: int = 10) -> Dict[str, Any]:
        """List previously generated podcasts (most recent first, per tool)."""
        try:
            return self.podcast_tool.list_podcasts(limit)
        except Exception as e:
            logger.error(f"Listing podcasts failed: {str(e)}")
            return {"success": False, "error": str(e)}

    async def get_podcast_async(self, podcast_id: str) -> Dict[str, Any]:
        """Get podcast metadata by id.

        NOTE(review): delegates to a synchronous tool call; kept ``async``
        so the caller-facing interface stays unchanged.
        """
        try:
            return self.podcast_tool.get_podcast(podcast_id)
        except Exception as e:
            logger.error(f"Getting podcast failed: {str(e)}")
            return {"success": False, "error": str(e)}

    async def get_podcast_audio_async(self, podcast_id: str) -> Dict[str, Any]:
        """Get the audio file path for a generated podcast."""
        try:
            return self.podcast_tool.get_podcast_audio(podcast_id)
        except Exception as e:
            logger.error(f"Getting podcast audio failed: {str(e)}")
            return {"success": False, "error": str(e)}

    async def answer_question_async(self, question: str, context_filter: Optional[Dict] = None) -> Dict[str, Any]:
        """RAG Q&A: retrieve supporting chunks, then answer from them.

        ``confidence`` is a coarse heuristic based purely on how many
        supporting chunks were retrieved (>= 3 -> "high").
        """
        try:
            search_results = await self.search_tool.search(question, top_k=5, filters=context_filter)
            if not search_results:
                return {"success": False, "error": "No relevant context found in your documents. Please make sure you have uploaded relevant documents.", "question": question}
            answer = await self.generative_tool.answer_question(question, search_results)
            return {"success": True, "question": question, "answer": answer, "sources": [result.to_dict() for result in search_results], "confidence": "high" if len(search_results) >= 3 else "medium"}
        except Exception as e:
            logger.error(f"Question answering failed: {str(e)}")
            return {"success": False, "error": str(e), "question": question}

    async def generate_outline_async(self, topic: str, num_sections: int = 5, detail_level: str = "medium") -> Dict[str, Any]:
        """Generate a structured outline for *topic*."""
        try:
            outline = await self.generative_tool.generate_outline(topic, num_sections, detail_level)
            return {"success": True, "result": outline}
        except Exception as e:
            return {"success": False, "error": str(e)}

    async def explain_concept_async(self, concept: str, audience: str = "general", length: str = "medium") -> Dict[str, Any]:
        """Explain *concept* tailored to the given audience and length."""
        try:
            explanation = await self.generative_tool.explain_concept(concept, audience, length)
            return {"success": True, "result": explanation}
        except Exception as e:
            return {"success": False, "error": str(e)}

    async def paraphrase_text_async(self, text: str, style: str = "formal") -> Dict[str, Any]:
        """Paraphrase *text* in the requested style."""
        try:
            paraphrase = await self.generative_tool.paraphrase_text(text, style)
            return {"success": True, "result": paraphrase}
        except Exception as e:
            return {"success": False, "error": str(e)}

    async def categorize_content_async(self, content: str, categories: List[str]) -> Dict[str, Any]:
        """Classify *content* into one of the user-supplied categories."""
        try:
            category = await self.generative_tool.categorize(content, categories)
            return {"success": True, "result": category}
        except Exception as e:
            return {"success": False, "error": str(e)}

    async def extract_key_insights_async(self, content: str, num_insights: int = 5) -> Dict[str, Any]:
        """Extract key insights and format them as a bulleted string."""
        try:
            insights = await self.generative_tool.extract_key_insights(content, num_insights)
            return {"success": True, "result": "\n".join([f"- {insight}" for insight in insights])}
        except Exception as e:
            return {"success": False, "error": str(e)}

    async def generate_questions_async(self, content: str, question_type: str = "comprehension", num_questions: int = 5) -> Dict[str, Any]:
        """Generate numbered questions of the given type from *content*."""
        try:
            questions = await self.generative_tool.generate_questions(content, question_type, num_questions)
            return {"success": True, "result": "\n".join([f"{i+1}. {q}" for i, q in enumerate(questions)])}
        except Exception as e:
            return {"success": False, "error": str(e)}

    async def extract_key_information_async(self, content: str) -> Dict[str, Any]:
        """Extract structured key information and return it as pretty JSON."""
        try:
            info = await self.llm_service.extract_key_information(content)
            return {"success": True, "result": json.dumps(info, indent=2)}
        except Exception as e:
            return {"success": False, "error": str(e)}

    def list_documents_sync(self, limit: int = 100, offset: int = 0) -> Dict[str, Any]:
        """List stored documents (synchronous wrapper over the async store).

        NOTE(review): ``total`` is the size of the returned page, not the
        library-wide document count.
        """
        try:
            documents = self.run_async(self.document_store.list_documents(limit, offset))
            return {"success": True, "documents": [doc.to_dict() for doc in documents], "total": len(documents)}
        except Exception as e:
            return {"success": False, "error": str(e)}
313
+
314
# Module-level singleton: every UI handler below routes through this server.
mcp_server = ContentOrganizerMCPServer()
try:
    print("⏳ Initializing LlamaIndex Service...")
    # Use the fixed run_async method to safely initialize
    mcp_server.run_async(mcp_server.llamaindex_service.initialize())
    print("✅ LlamaIndex Initialized Successfully!")
except Exception as e:
    # Startup continues even if indexing fails; handlers surface errors on use.
    print(f"⚠️ Warning during LlamaIndex init: {e}")
322
+
323
+
324
async def generate_podcast_transcript(
    document_ids: List[str],
    style: str = "conversational",
    duration_minutes: int = 10
) -> Dict[str, Any]:
    """Generate podcast transcript without audio"""
    # Thin tool wrapper: delegates to the shared server instance.
    return await mcp_server.generate_podcast_transcript_async(document_ids, style, duration_minutes)
331
+
332
async def list_podcasts(limit: int = 10) -> Dict[str, Any]:
    """List generated podcasts"""
    # NOTE(review): declared async but delegates to a synchronous method
    # (no await); callers still receive the plain dict result.
    return mcp_server.list_podcasts_sync(limit)
335
+
336
async def get_podcast(podcast_id: str) -> Dict[str, Any]:
    """Get podcast metadata"""
    # Thin tool wrapper around the shared server instance.
    return await mcp_server.get_podcast_async(podcast_id)
339
+
340
async def get_podcast_audio(podcast_id: str) -> Dict[str, Any]:
    """Get podcast audio path"""
    # Thin tool wrapper around the shared server instance.
    return await mcp_server.get_podcast_audio_async(podcast_id)
343
+
344
def get_document_list():
    """Render the document library as a human-readable listing string."""
    try:
        result = mcp_server.list_documents_sync(limit=100)
        if not result["success"]:
            return f"Error loading documents: {result['error']}"
        docs = result["documents"]
        if not docs:
            return "No documents in library yet. Upload some documents to get started!"
        summary = "📚 Documents in Library:\n\n"
        for idx, entry in enumerate(docs, 1):
            summary += f"{idx}. {entry['filename']} (ID: {entry['id'][:8]}...)\n"
            summary += f" Type: {entry['doc_type']}, Size: {entry['file_size']} bytes\n"
            if entry.get('tags'):
                summary += f" Tags: {', '.join(entry['tags'])}\n"
            summary += f" Created: {entry['created_at'][:10]}\n\n"
        return summary
    except Exception as e:
        return f"Error: {str(e)}"
363
+
364
def get_document_choices():
    """Build (label, document_id) pairs for the document dropdowns."""
    try:
        result = mcp_server.list_documents_sync(limit=100)
        if not (result["success"] and result["documents"]):
            return []
        pairs = []
        for doc in result["documents"]:
            label = f"{doc['filename']} ({doc['id'][:8]}...)"
            pairs.append((label, doc['id']))
        logger.info(f"Generated {len(pairs)} document choices")
        return pairs
    except Exception as e:
        logger.error(f"Error getting document choices: {str(e)}")
        return []
375
+
376
def refresh_library():
    """Refresh library and update all document selectors"""
    listing = get_document_list()
    choices = get_document_choices()
    logger.info(f"Refreshing library. Found {len(choices)} choices.")
    # One gr.update per dropdown that shows document choices.
    selector_updates = tuple(gr.update(choices=choices) for _ in range(4))
    return (listing,) + selector_updates
388
+
389
def upload_and_process_file(file):
    """
    Upload and process a document (PDF, TXT, DOCX, or images) into the AI Digital Library.
    Extracts text, generates embeddings, and indexes for semantic search.

    Args:
        file: Document file to upload and process

    Returns:
        Tuple of (status message, document id, library listing, plus one
        gr.update(choices=...) for each document-selector dropdown:
        delete_doc_dropdown_visible, doc_dropdown_content,
        doc_dropdown_tag_visible, podcast_doc_selector).
    """

    def _selector_updates():
        # The same choices are pushed to all four dropdowns; build a fresh
        # gr.update for each so no update object is shared.
        choices = get_document_choices()
        return tuple(gr.update(choices=choices) for _ in range(4))

    if file is None:
        return ("No file uploaded", "", get_document_list(), *_selector_updates())
    try:
        # Gradio may hand us a file-like object or a bare path string.
        file_path = file.name if hasattr(file, 'name') else str(file)
        file_type = Path(file_path).suffix.lower().strip('.')
        logger.info(f"Processing file: {file_path}, type: {file_type}")
        result = mcp_server.run_async(mcp_server.ingest_document_async(file_path, file_type))
        if result["success"]:
            # Keep the LlamaIndex view consistent with the newly ingested doc.
            logger.info("Syncing LlamaIndex with new document...")
            mcp_server.run_async(mcp_server.llamaindex_service.sync_on_demand())

        doc_list_updated = get_document_list()
        updates = _selector_updates()

        # Single success check (the original duplicated this condition).
        if result["success"]:
            status = (
                f"✅ Success: {result['message']}\n"
                f"Document ID: {result['document_id']}\n"
                f"Chunks created: {result['chunks_created']}"
            )
            return (status, result["document_id"], doc_list_updated, *updates)
        return (
            f"❌ Error: {result.get('error', 'Unknown error')}",
            "",
            doc_list_updated,
            *updates,
        )
    except Exception as e:
        logger.error(f"Error processing file: {str(e)}")
        return (f"❌ Error: {str(e)}", "", get_document_list(), *_selector_updates())
457
+
458
def perform_search(query, top_k):
    """
    Search through all uploaded documents using semantic search.
    Finds relevant content based on meaning, not just keywords.

    Args:
        query: Natural language search query
        top_k: Number of results to return (1-20)

    Returns:
        Search results with relevance scores and source documents
    """
    if not query.strip():
        return "Please enter a search query"
    try:
        outcome = mcp_server.run_async(mcp_server.semantic_search_async(query, int(top_k)))
        if not outcome["success"]:
            return f"❌ Search failed: {outcome['error']}"
        hits = outcome["results"]
        if not hits:
            return f"No results found for: '{query}'\n\nMake sure you have uploaded relevant documents first."
        rendered = f"🔍 Found {outcome['total_results']} results for: '{query}'\n\n"
        for idx, hit in enumerate(hits, 1):
            rendered += f"Result {idx}:\n"
            rendered += f"📊 Relevance Score: {hit['score']:.3f}\n"
            rendered += f"📄 Content: {hit['content'][:300]}...\n"
            if 'document_filename' in hit.get('metadata', {}):
                rendered += f"📁 Source: {hit['metadata']['document_filename']}\n"
            rendered += f"🔗 Document ID: {hit.get('document_id', 'Unknown')}\n"
            rendered += "-" * 80 + "\n\n"
        return rendered
    except Exception as e:
        logger.error(f"Search error: {str(e)}")
        return f"❌ Error: {str(e)}"
493
+
494
def update_options_visibility(task):
    """Update visibility of options based on selected task.

    Returns one gr.update(visible=...) per option widget, in the fixed order
    the UI wired them: summary style, outline sections, outline detail,
    explain audience, explain length, paraphrase style, categories,
    num items, question type.
    """
    # Which task(s) make each widget visible, in output order.
    visible_for = (
        ("Summarize",),
        ("Generate Outline",),
        ("Generate Outline",),
        ("Explain Concept",),
        ("Explain Concept",),
        ("Paraphrase",),
        ("Categorize",),
        ("Key Insights", "Generate Questions"),
        ("Generate Questions",),
    )
    return tuple(gr.update(visible=task in tasks) for tasks in visible_for)
507
+
508
def execute_content_task(task, doc_choice, custom_text,
                         summary_style, outline_sections, outline_detail,
                         explain_audience, explain_length,
                         paraphrase_style, categories_input,
                         num_items, question_type):
    """Dispatch the selected content-processing task to the MCP server.

    Args:
        task: Task label chosen in the UI (e.g. "Summarize", "Paraphrase").
        doc_choice: Selected document ID, or "none"/empty for no selection.
        custom_text: Free text that takes precedence over the selected document.
        summary_style..question_type: Per-task options; only the ones relevant
            to *task* are read.

    Returns:
        A formatted result string, or a warning/error message.
    """
    try:
        # Resolve input: explicit text wins; otherwise load the chosen document.
        content = ""
        if custom_text and custom_text.strip():
            content = custom_text
        elif doc_choice and doc_choice != "none":
            content = mcp_server.run_async(mcp_server.get_document_content_async(doc_choice))
            if not content:
                return "❌ Error: Document not found or empty"
        else:
            if task == "Generate Outline":
                # NOTE(review): custom_text is empty/None on this branch, so the
                # outline topic is blank here — confirm this is intended.
                content = custom_text
            else:
                return "⚠️ Please select a document or enter text"

        # Fallback result if no task branch below matches.
        result = {"success": False, "error": "Unknown task"}

        if task == "Summarize":
            result = mcp_server.run_async(mcp_server.summarize_content_async(content=content, style=summary_style))
            if result["success"]:
                return f"📝 Summary ({summary_style}):\n\n{result['summary']}"

        elif task == "Generate Outline":
            # For outlines, *content* is treated as the topic string.
            result = mcp_server.run_async(mcp_server.generate_outline_async(content, int(outline_sections), outline_detail))
            if result["success"]:
                return f"📝 Outline for '{content}':\n\n{result['result']}"

        elif task == "Explain Concept":
            result = mcp_server.run_async(mcp_server.explain_concept_async(content, explain_audience, explain_length))
            if result["success"]:
                return f"💡 Explanation ({explain_audience}):\n\n{result['result']}"

        elif task == "Paraphrase":
            result = mcp_server.run_async(mcp_server.paraphrase_text_async(content, paraphrase_style))
            if result["success"]:
                return f"🔄 Paraphrased Text ({paraphrase_style}):\n\n{result['result']}"

        elif task == "Categorize":
            # Comma-separated category list from the UI; empty input → [].
            categories = [c.strip() for c in categories_input.split(',')] if categories_input else []
            result = mcp_server.run_async(mcp_server.categorize_content_async(content, categories))
            if result["success"]:
                return f"🏷️ Category:\n\n{result['result']}"

        elif task == "Key Insights":
            result = mcp_server.run_async(mcp_server.extract_key_insights_async(content, int(num_items)))
            if result["success"]:
                return f"🔍 Key Insights:\n\n{result['result']}"

        elif task == "Generate Questions":
            result = mcp_server.run_async(mcp_server.generate_questions_async(content, question_type, int(num_items)))
            if result["success"]:
                return f"❓ Generated Questions ({question_type}):\n\n{result['result']}"

        elif task == "Extract Key Info":
            result = mcp_server.run_async(mcp_server.extract_key_information_async(content))
            if result["success"]:
                return f"📊 Key Information:\n\n{result['result']}"

        # A branch that ran but failed (or an unknown task) ends up here.
        if not result["success"]:
            return f"❌ Error: {result.get('error', 'Unknown error')}"

        return "✅ Task completed"

    except Exception as e:
        logger.error(f"Task execution error: {str(e)}")
        return f"❌ Error: {str(e)}"
578
+
579
def generate_tags_for_document(doc_choice, custom_text, max_tags):
    """
    Generate relevant tags for a document or custom text using AI.
    Tags are automatically saved to the document metadata.

    Args:
        doc_choice: Document ID to generate tags for (or None)
        custom_text: Custom text to generate tags from (if no document selected)
        max_tags: Maximum number of tags to generate (3-15)

    Returns:
        Generated tags and statistics
    """
    try:
        logger.info(f"Generate tags called with doc_choice: {doc_choice}, type: {type(doc_choice)}")
        # Treat "none" and empty string the same as no selection.
        document_id = doc_choice if doc_choice and doc_choice != "none" and doc_choice != "" else None

        if custom_text and custom_text.strip():
            logger.info("Using custom text for tag generation")
            outcome = mcp_server.run_async(
                mcp_server.generate_tags_async(content=custom_text, max_tags=int(max_tags))
            )
        elif document_id:
            logger.info(f"Generating tags for document: {document_id}")
            outcome = mcp_server.run_async(
                mcp_server.generate_tags_async(document_id=document_id, max_tags=int(max_tags))
            )
        else:
            return "Please select a document from the dropdown or enter text to generate tags"

        if not outcome["success"]:
            return f"❌ Tag generation failed: {outcome['error']}"

        report = [
            f"🏷️ Generated Tags:\n\n{', '.join(outcome['tags'])}\n",
            "📊 Statistics:",
            f"- Content length: {outcome['content_length']} characters",
            f"- Number of tags: {len(outcome['tags'])}",
        ]
        if outcome.get('document_id'):
            report.append(f"- Document ID: {outcome['document_id']}")
        report.append("\n✅ Tags have been saved to the document.")
        return "\n".join(report)
    except Exception as e:
        logger.error(f"Tag generation error: {str(e)}")
        return f"❌ Error: {str(e)}"
620
+
621
def ask_question(question):
    """
    Ask questions about your uploaded documents using RAG (Retrieval Augmented Generation).
    The AI searches through documents to find relevant context and provides comprehensive answers.

    Args:
        question: Natural language question about your documents

    Returns:
        AI-generated answer with source documents and confidence level
    """
    if not question.strip():
        return "Please enter a question"
    try:
        result = mcp_server.run_async(mcp_server.answer_question_async(question))
        if not result["success"]:
            return f"❌ {result.get('error', 'Failed to answer question')}"
        output_str = f"❓ Question: {result['question']}\n\n"
        output_str += f"💡 Answer:\n{result['answer']}\n\n"
        output_str += f"🎯 Confidence: {result['confidence']}\n\n"
        output_str += f"📚 Sources Used ({len(result['sources'])}):\n"
        for i, source_item in enumerate(result['sources'], 1):
            filename = source_item.get('metadata', {}).get('document_filename', 'Unknown')
            # BUG FIX: filename was computed but the line hard-coded "(unknown)";
            # interpolate the actual source filename.
            output_str += f"\n{i}. 📄 {filename}\n"
            output_str += f" 📝 Excerpt: {source_item['content'][:150]}...\n"
            output_str += f" 📊 Relevance: {source_item['score']:.3f}\n"
        return output_str
    except Exception as e:
        return f"❌ Error: {str(e)}"
651
+
652
def delete_document_from_library(document_id):
    """Remove a document plus its embeddings and refresh the library widgets.

    Returns (message, library listing, and one gr.update(choices=...) per
    document-selector dropdown).
    """

    def _respond(message):
        # Rebuild the listing and push fresh choices to all four dropdowns.
        choices = get_document_choices()
        return (
            message,
            get_document_list(),
            gr.update(choices=choices),
            gr.update(choices=choices),
            gr.update(choices=choices),
            gr.update(choices=choices),
        )

    if not document_id:
        return _respond("No document selected to delete.")
    try:
        removed_doc = mcp_server.run_async(mcp_server.document_store.delete_document(document_id))
        removed_vectors = mcp_server.run_async(mcp_server.vector_store.delete_document(document_id))

        pieces = []
        if removed_doc:
            pieces.append(f"🗑️ Document {document_id[:8]}... deleted from document store. ")
        else:
            pieces.append(f"❌ Failed to delete document {document_id[:8]}... from document store. ")
        if removed_vectors:
            pieces.append("Embeddings deleted from vector store.")
        else:
            pieces.append("Failed to delete embeddings from vector store (or no embeddings existed).")
        return _respond("".join(pieces))
    except Exception as e:
        logger.error(f"Error deleting document: {str(e)}")
        return _respond(f"❌ Error deleting document: {str(e)}")
701
+
702
# Voice conversation state
# Shared mutable state for the voice-assistant UI handlers:
#   session_id — active session identifier (None when no conversation)
#   active     — whether a conversation is currently live
#   transcript — accumulated chat messages for the session
voice_conversation_state = {
    "session_id": None,
    "active": False,
    "transcript": []
}
708
+
709
+ # voice_conversation_state = {
710
+ # "session_id": None,
711
+ # "active": False,
712
+ # "transcript": []
713
+ # }
714
+
715
+ # def start_voice_conversation():
716
+ # """
717
+ # Start a new voice conversation session
718
+
719
+ # Returns:
720
+ # Tuple of (status_message, start_button_state, stop_button_state, chatbot_history)
721
+ # """
722
+ # try:
723
+ # # Check if service is available
724
+ # if not mcp_server.elevenlabs_service.is_available():
725
+ # return (
726
+ # "⚠️ Voice assistant not configured.\n\n"
727
+ # "**Setup Instructions:**\n"
728
+ # "1. Get API key from: https://elevenlabs.io/app/settings/api-keys\n"
729
+ # "2. Create an agent at: https://elevenlabs.io/app/conversational-ai\n"
730
+ # "3. Add to .env file:\n"
731
+ # " - ELEVENLABS_API_KEY=your_api_key\n"
732
+ # " - ELEVENLABS_AGENT_ID=your_agent_id\n"
733
+ # "4. Restart the application",
734
+ # gr.update(interactive=True), # start button enabled
735
+ # gr.update(interactive=False), # stop button disabled
736
+ # []
737
+ # )
738
+
739
+ # # Create new session
740
+ # session_id = str(uuid.uuid4())
741
+ # result = mcp_server.run_async(
742
+ # mcp_server.elevenlabs_service.start_conversation(session_id)
743
+ # )
744
+
745
+ # if result.get("success"):
746
+ # voice_conversation_state["session_id"] = session_id
747
+ # voice_conversation_state["active"] = True
748
+ # voice_conversation_state["transcript"] = []
749
+
750
+ # # Initialize chatbot with welcome message
751
+ # initial_message = {
752
+ # "role": "assistant",
753
+ # "content": "👋 Hello! I'm your AI librarian. Ask me anything about your documents!"
754
+ # }
755
+
756
+ # return (
757
+ # "✅ Voice assistant is ready!\n\n"
758
+ # "You can now ask questions about your uploaded documents.",
759
+ # gr.update(interactive=False), # start button disabled
760
+ # gr.update(interactive=True), # stop button enabled
761
+ # [initial_message]
762
+ # )
763
+ # else:
764
+ # error_msg = result.get("error", "Unknown error")
765
+ # return (
766
+ # f"❌ Failed to start: {error_msg}\n\n"
767
+ # "**Troubleshooting:**\n"
768
+ # "• Check your API key is valid\n"
769
+ # "• Verify agent ID is correct\n"
770
+ # "• Check internet connection",
771
+ # gr.update(interactive=True),
772
+ # gr.update(interactive=False),
773
+ # []
774
+ # )
775
+ # except Exception as e:
776
+ # logger.error(f"Error starting voice conversation: {str(e)}", exc_info=True)
777
+ # return (
778
+ # f"❌ Error: {str(e)}",
779
+ # gr.update(interactive=True),
780
+ # gr.update(interactive=False),
781
+ # []
782
+ # )
783
+
784
+ # def stop_voice_conversation():
785
+ # """
786
+ # Stop active voice conversation
787
+
788
+ # Returns:
789
+ # Tuple of (status_message, start_button_state, stop_button_state, chatbot_history)
790
+ # """
791
+ # try:
792
+ # if not voice_conversation_state["active"]:
793
+ # return (
794
+ # "ℹ️ No active conversation",
795
+ # gr.update(interactive=True),
796
+ # gr.update(interactive=False),
797
+ # voice_conversation_state["transcript"]
798
+ # )
799
+
800
+ # session_id = voice_conversation_state["session_id"]
801
+ # if session_id:
802
+ # mcp_server.run_async(
803
+ # mcp_server.elevenlabs_service.end_conversation(session_id)
804
+ # )
805
+
806
+ # # Get conversation stats
807
+ # message_count = len(voice_conversation_state["transcript"])
808
+
809
+ # voice_conversation_state["active"] = False
810
+ # voice_conversation_state["session_id"] = None
811
+
812
+ # return (
813
+ # f"✅ Conversation ended\n\n"
814
+ # f"📊 Stats: {message_count} messages exchanged",
815
+ # gr.update(interactive=True),
816
+ # gr.update(interactive=False),
817
+ # voice_conversation_state["transcript"]
818
+ # )
819
+ # except Exception as e:
820
+ # logger.error(f"Error stopping conversation: {str(e)}")
821
+ # return (
822
+ # f"❌ Error: {str(e)}",
823
+ # gr.update(interactive=True),
824
+ # gr.update(interactive=False),
825
+ # voice_conversation_state["transcript"]
826
+ # )
827
+
828
+ # def send_voice_message_v6(message, chat_history):
829
+ # """
830
+ # Send message in voice conversation - Gradio 6+ format
831
+
832
+ # Args:
833
+ # message: User's text message
834
+ # chat_history: Current chat history (list of message dicts)
835
+
836
+ # Returns:
837
+ # Tuple of (updated_chat_history, cleared_input_box)
838
+ # """
839
+ # try:
840
+ # # Validate state
841
+ # if not voice_conversation_state["active"]:
842
+ # chat_history.append({
843
+ # "role": "assistant",
844
+ # "content": "⚠️ Please start a conversation first by clicking 'Start Conversation'"
845
+ # })
846
+ # return chat_history, ""
847
+
848
+ # # Validate input
849
+ # if not message or not message.strip():
850
+ # return chat_history, message
851
+
852
+ # session_id = voice_conversation_state["session_id"]
853
+
854
+ # # Add user message to display
855
+ # chat_history.append({
856
+ # "role": "user",
857
+ # "content": message
858
+ # })
859
+
860
+ # # Show typing indicator
861
+ # chat_history.append({
862
+ # "role": "assistant",
863
+ # "content": "🤔 Thinking..."
864
+ # })
865
+
866
+ # # Get AI response
867
+ # result = mcp_server.run_async(
868
+ # mcp_server.voice_tool.voice_qa(message, session_id)
869
+ # )
870
+
871
+ # # Remove typing indicator
872
+ # chat_history = chat_history[:-1]
873
+
874
+ # # Add response
875
+ # if result.get("success"):
876
+ # answer = result.get("answer", "No response")
877
+
878
+ # # Add helpful context if RAG was used
879
+ # if "document" in answer.lower() or "file" in answer.lower():
880
+ # footer = "\n\n💡 *Answer based on your documents*"
881
+ # else:
882
+ # footer = ""
883
+
884
+ # chat_history.append({
885
+ # "role": "assistant",
886
+ # "content": answer + footer
887
+ # })
888
+ # else:
889
+ # error_msg = result.get("error", "Unknown error")
890
+ # chat_history.append({
891
+ # "role": "assistant",
892
+ # "content": f"❌ Error: {error_msg}\n\n"
893
+ # "**Suggestions:**\n"
894
+ # "• Try rephrasing your question\n"
895
+ # "• Make sure you have uploaded relevant documents\n"
896
+ # "• Check if the question is about your document library"
897
+ # })
898
+
899
+ # # Update conversation state
900
+ # voice_conversation_state["transcript"] = chat_history
901
+
902
+ # return chat_history, ""
903
+
904
+ # except Exception as e:
905
+ # logger.error(f"Error in voice message: {str(e)}", exc_info=True)
906
+
907
+ # # Remove typing indicator if present
908
+ # if chat_history and chat_history[-1]["role"] == "assistant" and "Thinking" in chat_history[-1]["content"]:
909
+ # chat_history = chat_history[:-1]
910
+
911
+ # chat_history.append({
912
+ # "role": "assistant",
913
+ # "content": f"❌ An error occurred: {str(e)}\n\nPlease try again."
914
+ # })
915
+ # return chat_history, ""
916
+
917
+ # def test_voice_connection():
918
+ # """
919
+ # Test voice assistant connection
920
+
921
+ # Returns:
922
+ # Status message with test results
923
+ # """
924
+ # try:
925
+ # result = mcp_server.run_async(
926
+ # mcp_server.voice_tool.test_connection()
927
+ # )
928
+
929
+ # if result.get("success"):
930
+ # return (
931
+ # "✅ **Connection Test Passed**\n\n"
932
+ # f"• API Status: Connected\n"
933
+ # f"• Voices Available: {result.get('voices_available', 0)}\n"
934
+ # f"• RAG Tool: {'✓ Working' if result.get('rag_tool_working') else '✗ Failed'}\n"
935
+ # f"• Client Tools: {'✓ Registered' if result.get('client_tools_registered') else '✗ Not Registered'}\n\n"
936
+ # "🎉 Voice assistant is ready to use!"
937
+ # )
938
+ # else:
939
+ # return (
940
+ # "❌ **Connection Test Failed**\n\n"
941
+ # f"Error: {result.get('message', 'Unknown error')}\n\n"
942
+ # "**Troubleshooting:**\n"
943
+ # "1. Verify ELEVENLABS_API_KEY in .env\n"
944
+ # "2. Check ELEVENLABS_AGENT_ID is set\n"
945
+ # "3. Ensure API key is valid\n"
946
+ # "4. Check internet connection"
947
+ # )
948
+ # except Exception as e:
949
+ # logger.error(f"Connection test error: {str(e)}")
950
+ # return (
951
+ # f"❌ **Test Error**\n\n{str(e)}\n\n"
952
+ # "Please check your configuration and try again."
953
+ # )
954
+
955
+ # def get_conversation_stats():
956
+ # """
957
+ # Get statistics about current conversation
958
+
959
+ # Returns:
960
+ # Formatted stats string
961
+ # """
962
+ # try:
963
+ # if not voice_conversation_state["active"]:
964
+ # return "ℹ️ No active conversation"
965
+
966
+ # transcript = voice_conversation_state["transcript"]
967
+ # user_msgs = sum(1 for msg in transcript if msg["role"] == "user")
968
+ # ai_msgs = sum(1 for msg in transcript if msg["role"] == "assistant")
969
+
970
+ # return (
971
+ # "📊 **Conversation Statistics**\n\n"
972
+ # f"• Session ID: {voice_conversation_state['session_id'][:8]}...\n"
973
+ # f"• Your messages: {user_msgs}\n"
974
+ # f"• AI responses: {ai_msgs}\n"
975
+ # f"• Total exchanges: {user_msgs}\n"
976
+ # f"• Status: {'🟢 Active' if voice_conversation_state['active'] else '🔴 Inactive'}"
977
+ # )
978
+ # except Exception as e:
979
+ # logger.error(f"Error getting stats: {str(e)}")
980
+ # return f"❌ Error: {str(e)}"
981
+
982
def generate_podcast_ui(doc_ids, style, duration, voice1, voice2):
    """UI wrapper for podcast generation (EXISTING FUNCTION - keep as is)

    Args:
        doc_ids: Selected document ID(s); may arrive as a single string or a list.
        style: Podcast style label passed through to the generator.
        duration: Target length in minutes (coerced to int).
        voice1, voice2: Voice identifiers for the two hosts.

    Returns:
        Tuple of (status message, audio file path or None, transcript markdown,
        podcast id or empty string).
    """
    try:
        # Verbose logging of the raw inputs to help debug Gradio wiring issues.
        logger.info(f"generate_podcast_ui called with:")
        logger.info(f"  doc_ids: {doc_ids} (type: {type(doc_ids)})")
        logger.info(f"  style: {style}")
        logger.info(f"  duration: {duration}")
        logger.info(f"  voice1: {voice1}")
        logger.info(f"  voice2: {voice2}")

        if doc_ids is None:
            logger.warning("doc_ids is None")
            return ("⚠️ Please select at least one document", None, "No documents selected", "")

        # A single-select dropdown hands us a bare string — normalize to a list.
        if isinstance(doc_ids, str):
            logger.info(f"Converting string doc_id to list: {doc_ids}")
            doc_ids = [doc_ids]

        if not doc_ids or len(doc_ids) == 0:
            logger.warning(f"doc_ids is empty or has length 0: {doc_ids}")
            return ("⚠️ Please select at least one document", None, "No documents selected", "")

        # Drop blank/whitespace-only IDs.
        doc_ids = [doc_id for doc_id in doc_ids if doc_id and doc_id.strip()]

        if not doc_ids:
            logger.warning("After filtering, no valid doc_ids remain")
            return ("⚠️ Please select at least one document", None, "No documents selected", "")

        logger.info(f"Generating podcast with {len(doc_ids)} valid documents: {doc_ids}")

        result = mcp_server.run_async(
            mcp_server.generate_podcast_async(
                document_ids=doc_ids,
                style=style,
                duration_minutes=int(duration),
                host1_voice=voice1,
                host2_voice=voice2
            )
        )

        logger.info(f"Podcast generation result: success={result.get('success')}")

        if result.get("success"):
            audio_file = result.get("audio_file")
            transcript = result.get("transcript", "Transcript not available")
            message = result.get("message", "Podcast generated!")
            formatted_transcript = f"## Podcast Transcript\n\n{transcript}"

            logger.info(f"Podcast generated successfully: {audio_file}")

            return (
                f"✅ {message}",
                audio_file,
                formatted_transcript,
                result.get("podcast_id", "")
            )
        else:
            error = result.get("error", "Unknown error")
            logger.error(f"Podcast generation failed: {error}")
            return (f"❌ Error: {error}", None, "Generation failed", "")
    except Exception as e:
        logger.error(f"Podcast UI error: {str(e)}", exc_info=True)
        return (f"❌ Error: {str(e)}", None, "An error occurred", "")
1045
+
1046
def load_dashboard_stats():
    """Collect dashboard figures: document count, chunk total, storage usage,
    the five most recent documents, and service health indicators.

    Returns a 7-tuple: (doc_count, total_chunks, storage_mb, recent_rows,
    vector status, llm status, voice status).
    """
    try:
        docs_result = mcp_server.list_documents_sync(limit=1000)
        doc_count = 0
        total_chunks = 0
        storage_mb = 0.0
        recent_data = []

        if docs_result.get("success"):
            documents = docs_result.get("documents", [])
            doc_count = len(documents)
            total_chunks = sum(d.get("metadata", {}).get("chunk_count", 0) for d in documents)
            size_bytes = sum(d.get("file_size", 0) for d in documents)
            if size_bytes > 0:
                storage_mb = round(size_bytes / (1024 * 1024), 2)

            # Table rows for the five most recent documents.
            for d in documents[:5]:
                created = d.get("created_at")
                recent_data.append([
                    d.get("filename", "Unknown"),
                    d.get("doc_type", "unknown"),
                    created[:10] if created else "N/A",
                    f"{d.get('file_size', 0)} bytes",
                ])

        vector_stat = "✅ Online" if getattr(mcp_server, "vector_store", None) else "❌ Offline"
        llm_stat = "✅ Ready" if getattr(mcp_server, "llm_service", None) else "❌ Offline"
        voice_ok = getattr(mcp_server, "elevenlabs_service", None) and mcp_server.elevenlabs_service.is_available()
        voice_stat = "✅ Ready" if voice_ok else "⚠️ Configure API Key"

        return (
            doc_count,
            total_chunks,
            storage_mb,
            recent_data,
            vector_stat,
            llm_stat,
            voice_stat,
        )
    except Exception as e:
        logger.error(f"Error loading dashboard stats: {str(e)}")
        return (0, 0, 0.0, [], "❌ Error", "❌ Error", "❌ Error")
1091
+
1092
+ # ADD THESE HELPER FUNCTIONS BEFORE create_gradio_interface():
1093
def load_podcast_library_ui():
    """Load and display the podcast library with one audio player per podcast.

    Returns a 2-tuple (gr.Column, status message). On failure or when no
    podcasts exist, an invisible column and an explanatory message are
    returned instead.

    NOTE(review): Gradio components are instantiated here at event-handler
    time, outside the original Blocks build — confirm this actually renders
    in the target Gradio version (normally this needs gr.render / dynamic UI
    support).
    """
    try:
        result = mcp_server.list_podcasts_sync(limit=50)

        if not result.get("success"):
            return (
                gr.Column(visible=False),
                f"❌ Failed to load podcasts: {result.get('error', 'Unknown error')}"
            )

        podcasts = result.get("podcasts", [])

        if not podcasts:
            return (
                gr.Column(visible=False),
                "📭 No podcasts generated yet. Create your first podcast above!"
            )

        # Build the library UI dynamically: one Group per podcast with an
        # info column, an audio-player column, and a transcript accordion.
        with gr.Column(visible=True) as library_col:
            for idx, podcast in enumerate(podcasts, 1):
                with gr.Group():
                    with gr.Row():
                        # Column 1: Podcast Info (40%)
                        with gr.Column(scale=2):
                            gr.Markdown(f"### 🎙️ Podcast #{idx}")

                            # Extract document names from metadata
                            doc_names = []
                            doc_ids = []  # Initialize here to avoid reference error below

                            if podcast.get("metadata"):
                                doc_ids = podcast["metadata"].get("document_ids", [])
                                # Resolve each document id to its filename via
                                # the document store; fall back to a truncated
                                # id if the lookup fails.
                                for doc_id in doc_ids:
                                    try:
                                        doc = mcp_server.run_async(
                                            mcp_server.document_store.get_document(doc_id)
                                        )
                                        if doc:
                                            doc_names.append(doc.filename)
                                    except Exception as e:
                                        logger.warning(f"Could not fetch document {doc_id}: {e}")
                                        doc_names.append(f"Doc {doc_id[:8]}...")

                            # Display document names (or just a count when
                            # none could be resolved).
                            if doc_names:
                                gr.Markdown(f"**📄 Documents:** {', '.join(doc_names)}")
                            else:
                                doc_count = len(doc_ids) if doc_ids else 'N/A'
                                gr.Markdown(f"**📄 Documents:** {doc_count} document(s)")

                            # Podcast metadata (style, duration, creation date).
                            style = podcast.get("metadata", {}).get("style", "Unknown")
                            duration = podcast.get("metadata", {}).get("duration_minutes", "N/A")
                            created = podcast.get("created_at", "Unknown")[:19] if podcast.get("created_at") else "Unknown"
                            # Safe ID handling: either key may be present;
                            # long ids are truncated for display only.
                            podcast_id = podcast.get("id") or podcast.get("podcast_id") or "Unknown"
                            if isinstance(podcast_id, str) and len(podcast_id) > 16:
                                display_id = f"{podcast_id[:16]}..."
                            else:
                                display_id = podcast_id

                            gr.Markdown(
                                f"**🎨 Style:** {style.title()} \n"
                                f"**⏱️ Duration:** ~{duration} min \n"
                                f"**📅 Created:** {created} \n"
                                f"**🆔 ID:** `{display_id}`"
                            )

                        # Column 2: Audio Player (60%)
                        with gr.Column(scale=3):
                            audio_file = podcast.get("audio_file")

                            if audio_file and os.path.exists(audio_file):
                                gr.Audio(
                                    value=audio_file,
                                    type="filepath",
                                    interactive=False,
                                    show_label=False,
                                    show_download_button=True,
                                    waveform_options={"show_controls": True}
                                )
                            else:
                                # Audio path missing from metadata or file
                                # deleted/moved on disk.
                                gr.Markdown("⚠️ *Audio file not found*")
                                if audio_file:
                                    gr.Markdown(f"*Expected path: {audio_file}*")

                    # Optional: Show transcript in accordion
                    with gr.Accordion(f"📝 View Transcript", open=False):
                        transcript = podcast.get("transcript", "Transcript not available")
                        gr.Markdown(transcript)

        status_msg = f"✅ Loaded {len(podcasts)} podcast{'s' if len(podcasts) != 1 else ''}"
        return library_col, status_msg

    except Exception as e:
        logger.error(f"Error loading podcast library: {str(e)}", exc_info=True)
        return (
            gr.Column(visible=False),
            f"❌ Error loading library: {str(e)}"
        )
1197
+
1198
def create_gradio_interface():
    """Build the full Gradio Blocks UI for the AI Digital Library Assistant.

    Returns the (unlaunched) gr.Blocks app. All event handlers delegate to
    module-level helpers (get_document_choices, refresh_library,
    upload_and_process_file, delete_document_from_library, perform_search,
    generate_tags_for_document, ask_question, execute_content_task,
    update_options_visibility, generate_podcast_ui, load_podcast_library_ui,
    load_dashboard_stats) and the global mcp_server.
    """
    # Shared look & feel for every tab.
    custom_theme = gr.themes.Soft(
        primary_hue=gr.themes.colors.indigo,
        secondary_hue=gr.themes.colors.blue,
        neutral_hue=gr.themes.colors.slate,
        font=[gr.themes.GoogleFont("Inter"), "system-ui", "sans-serif"],
        font_mono=[gr.themes.GoogleFont("Fira Code"), "monospace"],
    ).set(
        button_primary_background_fill="*primary_500",
        button_primary_background_fill_hover="*primary_600",
        block_title_text_weight="600",
        block_label_text_size="sm",
        block_label_text_weight="500",
    )

    # Force dark theme using JavaScript: redirects once with ?__theme=dark.
    js_func = """
    function refresh() {
        const url = new URL(window.location);
        if (url.searchParams.get('__theme') !== 'dark') {
            url.searchParams.set('__theme', 'dark');
            window.location.href = url.href;
        }
    }
    """

    # Fetch the dashboard banner once at build time. A network failure must
    # not prevent the app from starting, so fall back to no banner.
    banner_url = "https://cdn-uploads.huggingface.co/production/uploads/66f1712d906c08084995f808/TSJexR45eNpUjHhbHDOag.png"
    img = None
    try:
        resp = requests.get(banner_url, timeout=10)
        resp.raise_for_status()
        img = Image.open(BytesIO(resp.content))
    except Exception as e:
        logger.warning(f"Could not load dashboard banner image: {e}")

    with gr.Blocks(title="🧠 AI Digital Library Assistant", theme=custom_theme, js=js_func) as interface:
        with gr.Tabs():
            with gr.Tab("🏠 Dashboard"):
                gr.Markdown("""
                # 🧠 AI Digital Library Assistant
                ## Your Intelligent Document Management Platform

                > **MCP 1st Birthday Hackathon Submission**
                > Transform documents into searchable knowledge • Generate AI podcasts • Voice Q&A • Smart tagging

                ### 🚀 Quick Start Guide
                1. **Upload** documents (PDF, Word, images) in the *Upload Documents* tab
                2. **Search** your library using natural language in the *Search* tab
                3. **Ask questions** and get AI answers with sources in the *Ask Questions* tab
                4. **Create content** with summarization, outlines, and more in *Content Studio*
                5. **Generate podcasts** from your documents in the *Podcast Studio*

                ### 👤 Author
                **Hugging Face ID:** [@Nihal2000](https://huggingface.co/Nihal2000)

                ---
                """)
                if img is not None:
                    gr.Image(value=img, show_label=False)

                gr.Markdown("## 📊 Quick Stats")
                with gr.Row():
                    total_docs = gr.Number(
                        label="📚 Total Documents",
                        value=0,
                        interactive=False,
                        container=True
                    )
                    total_chunks = gr.Number(
                        label="🧩 Vector Chunks",
                        value=0,
                        interactive=False,
                        container=True
                    )
                    storage_size = gr.Number(
                        label="💾 Storage (MB)",
                        value=0,
                        interactive=False,
                        container=True
                    )

                gr.Markdown("## 📊 Recent Activity")
                with gr.Group():
                    recent_docs = gr.Dataframe(
                        headers=["Document", "Type", "Date", "Size"],
                        datatype=["str", "str", "str", "str"],
                        row_count=(5, "fixed"),
                        col_count=(4, "fixed"),
                        interactive=False,
                        label="Recently Added Documents"
                    )

                gr.Markdown("## System Status")
                with gr.Row():
                    vector_status = gr.Textbox(
                        label="Vector Store",
                        value="✅ Online",
                        interactive=False,
                        container=True
                    )
                    llm_status = gr.Textbox(
                        label="LLM Service",
                        value="✅ Ready",
                        interactive=False,
                        container=True
                    )
                    voice_status = gr.Textbox(
                        label="Voice Service",
                        value="⚠️ Configure API Key",
                        interactive=False,
                        container=True
                    )

            with gr.Tab("📚 Document Library"):
                with gr.Row():
                    with gr.Column():
                        gr.Markdown("### Your Document Collection")
                        document_list_display = gr.Textbox(label="Documents in Library", value=get_document_list(), lines=20, interactive=False)
                        refresh_btn_library = gr.Button("🔄 Refresh Library", variant="secondary")
                        delete_doc_dropdown_visible = gr.Dropdown(label="Select Document to Delete", choices=get_document_choices(), value=None, interactive=True, allow_custom_value=False)
                        delete_btn = gr.Button("🗑️ Delete Selected Document", variant="stop")
                        delete_output_display = gr.Textbox(label="Delete Status", visible=True)

            with gr.Tab("📄 Upload Documents"):
                gr.Markdown("""
                ### 📁 Upload & Process Documents

                Upload PDFs, DOCX, TXT or Images. OCR runs automatically.
                Once processed, the document becomes searchable and available for summaries, tagging, podcasting and RAG.
                """)

                with gr.Row(equal_height=True):

                    # LEFT SIDE: UPLOAD PANEL
                    with gr.Column(scale=1):
                        with gr.Group():
                            gr.Markdown("#### 📤 Upload File")

                            file_input_upload = gr.File(
                                label="Select a document",
                                file_types=[".pdf", ".txt", ".docx", ".png", ".jpg", ".jpeg"],
                                type="filepath",
                                show_label=False
                            )

                            gr.HTML("<div style='height:10px'></div>")  # spacing

                            upload_btn_process = gr.Button(
                                "🚀 Process & Add to Library",
                                variant="primary",
                                size="lg",
                                elem_id="upload-btn"
                            )

                    # RIGHT SIDE: STATUS PANEL
                    with gr.Column(scale=1):
                        with gr.Group():
                            gr.Markdown("#### 📦 Processing Preview")

                            upload_output_display = gr.Textbox(
                                label="Processing Status",
                                placeholder="⏳ Waiting for upload...",
                                lines=10,
                                interactive=False,
                            )

                            doc_id_output_display = gr.Textbox(
                                label="📌 Document ID",
                                placeholder="Will appear after processing...",
                                interactive=False
                            )

                # Optional: center button styling
                gr.HTML("""
                <style>
                #upload-btn button {
                    width: 100%;
                    font-size: 18px;
                    padding: 14px;
                }
                </style>
                """)

            with gr.Tab("📝 Content Studio"):
                gr.Markdown("""
                ### 🎨 Create & Analyze Content
                Transform documents with AI-powered tools: summarize, outline, explain, and more.
                """)

                with gr.Row():
                    with gr.Column(scale=2):
                        with gr.Group():
                            gr.Markdown("#### 📄 Content Source")
                            doc_dropdown_content = gr.Dropdown(
                                label="Select Document",
                                choices=get_document_choices(),
                                value=None,
                                interactive=True,
                                info="Choose a document from your library"
                            )

                            gr.Markdown("**OR**")

                            content_text_input = gr.Textbox(
                                label="Enter Text or Topic",
                                placeholder="Paste content or enter a topic...",
                                lines=4,
                                info="For outlines, enter a topic. For other tasks, paste text to analyze."
                            )

                        with gr.Group():
                            gr.Markdown("#### 🛠️ Task Configuration")
                            task_dropdown = gr.Dropdown(
                                label="Select Task",
                                choices=[
                                    "Summarize", "Generate Outline", "Explain Concept",
                                    "Paraphrase", "Categorize", "Key Insights",
                                    "Generate Questions", "Extract Key Info"
                                ],
                                value="Summarize",
                                interactive=True,
                                info="Choose the type of analysis to perform"
                            )

                            # Task-specific options; visibility is toggled by
                            # update_options_visibility when the task changes.
                            with gr.Accordion("⚙️ Advanced Options", open=False):
                                summary_style_opt = gr.Dropdown(
                                    label="Summary Style",
                                    choices=["concise", "detailed", "bullet_points", "executive"],
                                    value="concise",
                                    visible=True,
                                    info="How detailed should the summary be?"
                                )

                                outline_sections_opt = gr.Slider(
                                    label="Number of Sections",
                                    minimum=3, maximum=10, value=5, step=1,
                                    visible=False,
                                    info="How many main sections?"
                                )
                                outline_detail_opt = gr.Dropdown(
                                    label="Detail Level",
                                    choices=["brief", "medium", "detailed"],
                                    value="medium",
                                    visible=False
                                )

                                explain_audience_opt = gr.Dropdown(
                                    label="Target Audience",
                                    choices=["general", "technical", "beginner", "expert"],
                                    value="general",
                                    visible=False,
                                    info="Who is this explanation for?"
                                )
                                explain_length_opt = gr.Dropdown(
                                    label="Length",
                                    choices=["brief", "medium", "detailed"],
                                    value="medium",
                                    visible=False
                                )

                                paraphrase_style_opt = gr.Dropdown(
                                    label="Style",
                                    choices=["formal", "casual", "academic", "simple", "technical"],
                                    value="formal",
                                    visible=False,
                                    info="Writing style for paraphrasing"
                                )

                                categories_input_opt = gr.Textbox(
                                    label="Categories (comma separated)",
                                    placeholder="Technology, Business, Science...",
                                    visible=False
                                )

                                num_items_opt = gr.Slider(
                                    label="Number of Items",
                                    minimum=1, maximum=10, value=5, step=1,
                                    visible=False
                                )
                                question_type_opt = gr.Dropdown(
                                    label="Question Type",
                                    choices=["comprehension", "analysis", "application", "creative", "factual"],
                                    value="comprehension",
                                    visible=False
                                )

                            run_task_btn = gr.Button("🚀 Run Task", variant="primary", size="lg")

                    with gr.Column(scale=3):
                        with gr.Group():
                            gr.Markdown("#### 📊 Result")
                            content_output_display = gr.Textbox(
                                label="",
                                lines=25,
                                placeholder="Results will appear here...",
                                show_copy_button=True,
                                container=False
                            )

                task_dropdown.change(
                    fn=update_options_visibility,
                    inputs=[task_dropdown],
                    outputs=[
                        summary_style_opt, outline_sections_opt, outline_detail_opt,
                        explain_audience_opt, explain_length_opt, paraphrase_style_opt,
                        categories_input_opt, num_items_opt, question_type_opt
                    ]
                )

                run_task_btn.click(
                    fn=execute_content_task,
                    inputs=[
                        task_dropdown, doc_dropdown_content, content_text_input,
                        summary_style_opt, outline_sections_opt, outline_detail_opt,
                        explain_audience_opt, explain_length_opt, paraphrase_style_opt,
                        categories_input_opt, num_items_opt, question_type_opt
                    ],
                    outputs=[content_output_display]
                )

            # (A fully commented-out "Voice Assistant" tab was removed here;
            # recover it from version control if it is ever revived.)

            with gr.Tab("🎧 Podcast Studio"):
                gr.Markdown("""
                # 🎙️ AI Podcast Studio
                ## Transform Documents into Engaging Audio
                Convert your documents into professional podcast conversations with AI-generated voices.
                ### How It Works:
                1. **Select Documents** - Choose 1-5 documents from your library
                2. **Choose Style** - Pick conversation style (casual, educational, etc.)
                3. **Set Duration** - Select podcast length (5-30 minutes)
                4. **Select Voices** - Pick two AI hosts from available voices
                5. **Generate** - AI creates natural dialogue discussing your content
                ### Powered By:
                - 🎵 **ElevenLabs AI** - Ultra-realistic voice synthesis
                - 🤖 **LLM** - Intelligent content analysis and script generation
                - 📚 **RAG** - Context-aware information retrieval
                ---
                """)

                with gr.Row():
                    with gr.Column(scale=2):
                        # Configuration Panel
                        with gr.Group():
                            gr.Markdown("#### 📚 Select Content")

                            podcast_doc_selector = gr.CheckboxGroup(
                                choices=get_document_choices(),
                                label="Documents to Include",
                                info="Choose 1-5 documents for best results",
                                interactive=True,
                                value=[]
                            )

                            gr.Markdown("*Selected document IDs will be used for podcast generation*")

                            with gr.Accordion("🎨 Podcast Settings", open=True):
                                with gr.Row():
                                    podcast_style = gr.Dropdown(
                                        label="Style",
                                        choices=["conversational", "educational", "technical", "casual"],
                                        value="conversational",
                                        info="Sets the tone and format"
                                    )

                                    podcast_duration = gr.Slider(
                                        label="Duration (minutes)",
                                        minimum=5,
                                        maximum=30,
                                        value=10,
                                        step=5,
                                        info="Approximate length"
                                    )

                                gr.Markdown("#### 🗣️ Voice Selection")
                                with gr.Row():
                                    host1_voice_selector = gr.Dropdown(
                                        label="Host 1",
                                        choices=["Rachel", "Adam", "Domi", "Bella", "Antoni", "Elli", "Josh"],
                                        value="Rachel"
                                    )
                                    host2_voice_selector = gr.Dropdown(
                                        label="Host 2",
                                        choices=["Adam", "Rachel", "Josh", "Sam", "Emily", "Antoni", "Arnold"],
                                        value="Adam"
                                    )

                            generate_podcast_btn = gr.Button(
                                "🎙️ Generate Podcast",
                                variant="primary",
                                size="lg"
                            )

                            podcast_status = gr.Textbox(
                                label="Status",
                                interactive=False,
                                lines=2
                            )

                            podcast_id_display = gr.Textbox(
                                label="Podcast ID",
                                interactive=False,
                                visible=False
                            )

                    with gr.Column(scale=3):
                        # Output Panel - Latest Generated
                        with gr.Group():
                            gr.Markdown("#### 🎵 Latest Generated Podcast")

                            podcast_audio_player = gr.Audio(
                                type="filepath",
                                interactive=False,
                                autoplay=True,
                                show_label=False
                            )

                            with gr.Accordion("📝 Transcript", open=False):
                                podcast_transcript_display = gr.Markdown(
                                    value="*Transcript will appear after generation...*"
                                )

                        # Container that load_podcast_library_ui repopulates.
                        podcast_library_container = gr.Column()
                        with gr.Row():
                            refresh_podcast_library_btn = gr.Button("🔄 Refresh Library", variant="secondary")
                            podcast_library_status = gr.Textbox(
                                label="Library Status",
                                value="Click 'Refresh Library' to load podcasts",
                                interactive=False,
                                scale=3
                            )

                # Event handler for generate button
                generate_podcast_btn.click(
                    fn=generate_podcast_ui,
                    inputs=[
                        podcast_doc_selector,
                        podcast_style,
                        podcast_duration,
                        host1_voice_selector,
                        host2_voice_selector
                    ],
                    outputs=[
                        podcast_status,
                        podcast_audio_player,
                        podcast_transcript_display,
                        podcast_id_display
                    ]
                ).then(
                    # Auto-refresh library after generation
                    fn=load_podcast_library_ui,
                    outputs=[podcast_library_container, podcast_library_status]
                )

                # Podcast Library section (at tab level, below the studio row)
                gr.Markdown("---")
                gr.Markdown("### 📚 Podcast Library")
                gr.Markdown("Browse and play all your generated podcasts")

                # Event handler for refresh button
                refresh_podcast_library_btn.click(
                    fn=load_podcast_library_ui,
                    outputs=[podcast_library_container, podcast_library_status]
                )

            with gr.Tab("❓ Ask Questions"):
                with gr.Row():
                    with gr.Column():
                        gr.Markdown("""### Ask Questions About Your Documents
                        The AI will search through all your uploaded documents to find relevant information
                        and provide comprehensive answers with sources.""")
                        qa_question_input = gr.Textbox(label="Your Question", placeholder="Ask anything about your documents...", lines=3)
                        qa_btn_action = gr.Button("❓ Get Answer", variant="primary", size="lg")
                    with gr.Column():
                        qa_output_display = gr.Textbox(label="AI Answer", lines=20, placeholder="Answer will appear here with sources...")

            with gr.Tab("🏷️ Generate Tags"):
                with gr.Row():
                    with gr.Column():
                        gr.Markdown("### Generate Document Tags")
                        doc_dropdown_tag_visible = gr.Dropdown(label="Select Document to Tag", choices=get_document_choices(), value=None, interactive=True, allow_custom_value=False)
                        tag_text_input = gr.Textbox(label="Or Paste Text to Generate Tags", placeholder="Paste any text here to generate tags...", lines=8)
                        max_tags_slider = gr.Slider(label="Number of Tags", minimum=3, maximum=15, value=5, step=1)
                        tag_btn_action = gr.Button("🏷️ Generate Tags", variant="primary", size="lg")
                    with gr.Column():
                        tag_output_display = gr.Textbox(label="Generated Tags", lines=10, placeholder="Tags will appear here...")

            with gr.Tab("🔍 Search Documents"):
                gr.Markdown("""
                ### 🔎 Semantic Search
                Find relevant content across your entire document library using AI-powered semantic search.
                """)

                with gr.Row():
                    with gr.Column(scale=1):
                        with gr.Group():
                            search_query_input = gr.Textbox(
                                label="Search Query",
                                placeholder="What are you looking for?",
                                lines=2,
                                info="Use natural language to describe what you need"
                            )

                            with gr.Accordion("🎛️ Search Options", open=False):
                                search_top_k_slider = gr.Slider(
                                    label="Number of Results",
                                    minimum=1, maximum=20, value=5, step=1,
                                    info="More results = broader search"
                                )

                            search_btn_action = gr.Button("🔍 Search", variant="primary", size="lg")

                    with gr.Column(scale=2):
                        with gr.Group():
                            search_output_display = gr.Textbox(
                                label="Results",
                                lines=20,
                                placeholder="Search results will appear here...",
                                show_copy_button=True
                            )

            # Components whose document choices must be kept in sync whenever
            # the library changes (upload / delete / manual refresh).
            all_dropdowns_to_update = [delete_doc_dropdown_visible, doc_dropdown_content, podcast_doc_selector]

            refresh_outputs = [document_list_display] + all_dropdowns_to_update
            refresh_btn_library.click(fn=refresh_library, outputs=refresh_outputs)

            upload_outputs = [upload_output_display, doc_id_output_display, document_list_display] + all_dropdowns_to_update
            upload_btn_process.click(upload_and_process_file, inputs=[file_input_upload], outputs=upload_outputs)

            delete_outputs = [delete_output_display, document_list_display] + all_dropdowns_to_update
            delete_btn.click(delete_document_from_library, inputs=[delete_doc_dropdown_visible], outputs=delete_outputs)

            search_btn_action.click(perform_search, inputs=[search_query_input, search_top_k_slider], outputs=[search_output_display])
            tag_btn_action.click(generate_tags_for_document, inputs=[doc_dropdown_tag_visible, tag_text_input, max_tags_slider], outputs=[tag_output_display])
            qa_btn_action.click(ask_question, inputs=[qa_question_input], outputs=[qa_output_display])

        # Populate dashboard stats and document lists on page load.
        interface.load(
            fn=load_dashboard_stats,
            outputs=[total_docs, total_chunks, storage_size, recent_docs, vector_status, llm_status, voice_status]
        )
        interface.load(fn=refresh_library, outputs=refresh_outputs)
    return interface
1851
+
1852
+ if __name__ == "__main__":
1853
+ gradio_interface = create_gradio_interface()
1854
+ gradio_interface.launch(mcp_server=True)
config.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import Optional
3
+ from dotenv import load_dotenv
4
+
5
+ load_dotenv()
6
+
7
+
8
class Config:
    """Central application settings, resolved from environment variables.

    All values are read once at import time (after load_dotenv()); missing
    variables fall back to the defaults shown inline. Access via the shared
    `config` singleton at the bottom of this module.
    """

    # API Keys (all optional; the related service is disabled when unset)
    NEBIUS_API_KEY: Optional[str] = os.getenv("NEBIUS_API_KEY")
    MISTRAL_API_KEY: Optional[str] = os.getenv("MISTRAL_API_KEY")
    # Accepts either HUGGINGFACE_API_KEY or the conventional HF_TOKEN.
    HUGGINGFACE_API_KEY: Optional[str] = os.getenv("HUGGINGFACE_API_KEY", os.getenv("HF_TOKEN"))
    OPENAI_API_KEY: Optional[str] = os.getenv("OPENAI_API_KEY")
    ANTHROPIC_API_KEY: Optional[str] = os.getenv("ANTHROPIC_API_KEY")

    # NEBIUS Configuration (OpenAI-compatible endpoint for OSS models)
    NEBIUS_BASE_URL: str = os.getenv("NEBIUS_BASE_URL", "https://api.studio.nebius.com/v1/")
    NEBIUS_MODEL: str = os.getenv("NEBIUS_MODEL", "meta-llama/Llama-3.3-70B-Instruct")

    # Model Configuration
    # Using OpenAI managed embeddings for performance/quality
    EMBEDDING_MODEL: str = os.getenv("EMBEDDING_MODEL", "text-embedding-3-small")

    MISTRAL_MODEL: str = os.getenv("MISTRAL_MODEL", "mistral-large-2407")
    OPENAI_MODEL: str = os.getenv("OPENAI_MODEL", "gpt-5.1-chat-latest")
    FAST_MODEL: str = os.getenv("FAST_MODEL", "gpt-5-mini")

    # Vector Store Configuration (local filesystem paths)
    DATA_DIR: str = os.getenv("DATA_DIR", "./data")
    VECTOR_STORE_PATH: str = os.getenv("VECTOR_STORE_PATH", "./data/vector_store")
    DOCUMENT_STORE_PATH: str = os.getenv("DOCUMENT_STORE_PATH", "./data/documents")
    INDEX_NAME: str = os.getenv("INDEX_NAME", "content_index")

    # Processing Configuration (chunk sizes are in characters)
    CHUNK_SIZE: int = int(os.getenv("CHUNK_SIZE", "500"))
    CHUNK_OVERLAP: int = int(os.getenv("CHUNK_OVERLAP", "50"))
    MAX_CONCURRENT_REQUESTS: int = int(os.getenv("MAX_CONCURRENT_REQUESTS", "5"))
    # Search Configuration
    DEFAULT_TOP_K: int = int(os.getenv("DEFAULT_TOP_K", "5"))
    SIMILARITY_THRESHOLD: float = float(os.getenv("SIMILARITY_THRESHOLD", "0.3"))

    # OCR Configuration (TESSERACT_PATH only needed when tesseract is not on PATH)
    TESSERACT_PATH: Optional[str] = os.getenv("TESSERACT_PATH")
    OCR_LANGUAGE: str = os.getenv("OCR_LANGUAGE", "eng")

    # ElevenLabs Configuration
    ELEVENLABS_API_KEY: Optional[str] = os.getenv("ELEVENLABS_API_KEY")
    ELEVENLABS_AGENT_ID: Optional[str] = os.getenv("ELEVENLABS_AGENT_ID")
    ELEVENLABS_VOICE_MODEL: str = os.getenv("ELEVENLABS_VOICE_MODEL", "Rachel")

    # App Configuration
    HOST: str = os.getenv("HOST", "0.0.0.0")
    PORT: int = int(os.getenv("PORT", "7860"))
    # DEBUG is truthy only for the literal string "true" (case-insensitive).
    DEBUG: bool = os.getenv("DEBUG", "False").lower() == "true"

# Shared singleton used throughout the application.
config = Config()
core/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # Core module initialization
core/chunker.py ADDED
@@ -0,0 +1,303 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # chunker.py
2
+ import logging
3
+ from typing import List, Dict, Any, Optional
4
+ import re
5
+ from .models import Chunk
6
+ from .text_preprocessor import TextPreprocessor
7
+ import config
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+ class TextChunker:
12
    def __init__(self):
        """Initialize the chunker from the global app config.

        chunk_size and chunk_overlap are character counts, taken from the
        CHUNK_SIZE / CHUNK_OVERLAP settings.
        """
        self.config = config.config
        self.preprocessor = TextPreprocessor()

        self.chunk_size = self.config.CHUNK_SIZE
        self.chunk_overlap = self.config.CHUNK_OVERLAP
18
+
19
+ def chunk_document(self, document_id: str, content: str, method: str = "recursive") -> List[Chunk]:
20
+ """Chunk a document using the specified method"""
21
+ if not content:
22
+ return []
23
+
24
+ try:
25
+ if method == "recursive":
26
+ return self._recursive_chunk(document_id, content)
27
+ elif method == "sentence":
28
+ return self._sentence_chunk(document_id, content)
29
+ elif method == "paragraph":
30
+ return self._paragraph_chunk(document_id, content)
31
+ elif method == "fixed":
32
+ return self._fixed_chunk(document_id, content)
33
+ else:
34
+ logger.warning(f"Unknown chunking method: {method}, using recursive")
35
+ return self._recursive_chunk(document_id, content)
36
+ except Exception as e:
37
+ logger.error(f"Error chunking document: {str(e)}")
38
+ # Fallback to simple fixed chunking
39
+ return self._fixed_chunk(document_id, content)
40
+
41
    def _recursive_chunk(self, document_id: str, content: str) -> List[Chunk]:
        """Recursively split text by different separators.

        Splits on the coarsest separator that keeps pieces within
        self.chunk_size, recursing to finer separators (and finally raw
        character slices) for oversized pieces. Returns Chunk objects with
        chunk_overlap characters of the previous chunk prepended.
        """
        chunks = []

        # Define separators in order of preference (coarse to fine)
        separators = [
            "\n\n",  # Paragraphs
            "\n",    # Lines
            ". ",    # Sentences
            ", ",    # Clauses
            " "      # Words
        ]

        def split_text(text: str, separators: List[str], chunk_size: int) -> List[str]:
            # Base case: text already fits in one chunk.
            if len(text) <= chunk_size:
                return [text] if text.strip() else []

            if not separators:
                # If no separators left, split by character
                return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]

            separator = separators[0]
            remaining_separators = separators[1:]

            # Greedily pack splits back together up to chunk_size.
            splits = text.split(separator)
            result = []
            current_chunk = ""

            for split in splits:
                if len(current_chunk) + len(split) + len(separator) <= chunk_size:
                    if current_chunk:
                        current_chunk += separator + split
                    else:
                        current_chunk = split
                else:
                    if current_chunk:
                        result.append(current_chunk)

                    if len(split) > chunk_size:
                        # Split is too big, need to split further with the
                        # next (finer) separator.
                        result.extend(split_text(split, remaining_separators, chunk_size))
                        current_chunk = ""
                    else:
                        current_chunk = split

            if current_chunk:
                result.append(current_chunk)

            return result

        text_chunks = split_text(content, separators, self.chunk_size)

        # Create chunk objects with overlap
        for i, chunk_text in enumerate(text_chunks):
            if not chunk_text.strip():
                continue

            # Calculate positions.
            # NOTE(review): content.find() locates the FIRST occurrence, so
            # repeated chunk text yields wrong start/end positions — confirm
            # whether positions are relied upon downstream.
            start_pos = content.find(chunk_text)
            if start_pos == -1:
                start_pos = i * self.chunk_size
            end_pos = start_pos + len(chunk_text)

            # Add overlap from previous chunk if not the first chunk.
            # NOTE(review): the overlap is prepended AFTER start_pos/end_pos
            # were computed, so positions refer to the pre-overlap text while
            # metadata["original_length"] below includes the overlap.
            if i > 0 and self.chunk_overlap > 0:
                prev_chunk = text_chunks[i-1]
                overlap_text = prev_chunk[-self.chunk_overlap:] if len(prev_chunk) > self.chunk_overlap else prev_chunk
                chunk_text = overlap_text + " " + chunk_text

            chunk = Chunk(
                id=self._generate_chunk_id(document_id, i),
                document_id=document_id,
                content=chunk_text.strip(),
                chunk_index=i,
                start_pos=start_pos,
                end_pos=end_pos,
                metadata={
                    "chunk_method": "recursive",
                    "original_length": len(chunk_text),  # includes prepended overlap
                    "word_count": len(chunk_text.split())
                }
            )
            chunks.append(chunk)

        return chunks
126
+
127
def _sentence_chunk(self, document_id: str, content: str) -> List[Chunk]:
    """Chunk text by sentences, packing whole sentences up to chunk_size.

    Fixes vs. previous version:
    - a forward-moving search offset is used so a sentence that occurs more
      than once is assigned its own position rather than the first match;
    - the duplicated Chunk-construction code is factored into one helper.
    """
    chunks: List[Chunk] = []
    sentences = self.preprocessor.extract_sentences(content)

    current_chunk = ""
    start_pos = 0
    search_from = 0

    def locate(sentence: str) -> int:
        # Find the sentence at or after the previous match (duplicate-safe);
        # returns -1 (like str.find) when the sentence cannot be located.
        nonlocal search_from
        pos = content.find(sentence, search_from)
        if pos != -1:
            search_from = pos + len(sentence)
        return pos

    def flush() -> None:
        # Emit the accumulated sentences as one Chunk. len(chunks) is the
        # next chunk index, matching the old explicit counter.
        chunks.append(Chunk(
            id=self._generate_chunk_id(document_id, len(chunks)),
            document_id=document_id,
            content=current_chunk.strip(),
            chunk_index=len(chunks),
            start_pos=start_pos,
            end_pos=start_pos + len(current_chunk),
            metadata={
                "chunk_method": "sentence",
                "sentence_count": len(self.preprocessor.extract_sentences(current_chunk))
            }
        ))

    for sentence in sentences:
        if len(current_chunk) + len(sentence) <= self.chunk_size:
            if current_chunk:
                current_chunk += " " + sentence
            else:
                current_chunk = sentence
                start_pos = locate(sentence)
        else:
            if current_chunk:
                flush()
            # Start a new accumulator with the sentence that didn't fit.
            current_chunk = sentence
            start_pos = locate(sentence)

    # Add final chunk
    if current_chunk:
        flush()

    return chunks
180
+
181
def _paragraph_chunk(self, document_id: str, content: str) -> List[Chunk]:
    """Chunk text by paragraphs ('\\n\\n'-separated), packing whole
    paragraphs up to chunk_size; paragraphs that alone exceed the limit
    fall back to fixed-size splitting.

    BUGFIX: the accumulator is now cleared after an oversized paragraph is
    split. Previously `current_chunk` kept the text that had just been
    flushed, so it could be emitted a second time (in the next flush or as
    the final chunk).
    """
    chunks = []
    paragraphs = [p.strip() for p in content.split('\n\n') if p.strip()]

    current_chunk = ""
    chunk_index = 0
    start_pos = 0

    for paragraph in paragraphs:
        if len(current_chunk) + len(paragraph) <= self.chunk_size:
            if current_chunk:
                current_chunk += "\n\n" + paragraph
            else:
                current_chunk = paragraph
                start_pos = content.find(paragraph)
        else:
            # Flush the accumulated paragraphs before handling this one.
            if current_chunk:
                chunk = Chunk(
                    id=self._generate_chunk_id(document_id, chunk_index),
                    document_id=document_id,
                    content=current_chunk.strip(),
                    chunk_index=chunk_index,
                    start_pos=start_pos,
                    end_pos=start_pos + len(current_chunk),
                    metadata={
                        "chunk_method": "paragraph",
                        "paragraph_count": len([p for p in current_chunk.split('\n\n') if p.strip()])
                    }
                )
                chunks.append(chunk)
                chunk_index += 1

            if len(paragraph) > self.chunk_size:
                # Paragraph is too long on its own: split it further.
                para_chunks = self._fixed_chunk(document_id, paragraph)
                for pc in para_chunks:
                    # Re-number the sub-chunks into this document's sequence.
                    pc.chunk_index = chunk_index
                    pc.id = self._generate_chunk_id(document_id, chunk_index)
                    chunks.append(pc)
                    chunk_index += 1
                current_chunk = ""  # BUGFIX: don't re-emit already-flushed text
            else:
                current_chunk = paragraph
                start_pos = content.find(paragraph)

    # Add final chunk
    if current_chunk:
        chunk = Chunk(
            id=self._generate_chunk_id(document_id, chunk_index),
            document_id=document_id,
            content=current_chunk.strip(),
            chunk_index=chunk_index,
            start_pos=start_pos,
            end_pos=start_pos + len(current_chunk),
            metadata={
                "chunk_method": "paragraph",
                "paragraph_count": len([p for p in current_chunk.split('\n\n') if p.strip()])
            }
        )
        chunks.append(chunk)

    return chunks
243
+
244
def _fixed_chunk(self, document_id: str, content: str) -> List[Chunk]:
    """Simple fixed-size chunking with overlap.

    BUGFIX: the stride is clamped to at least 1, so a configuration with
    chunk_overlap >= chunk_size no longer raises
    `ValueError: range() arg 3 must not be zero` (or goes negative).
    """
    chunks = []
    # Stride between chunk starts; clamp to >= 1 for safety.
    step = max(1, self.chunk_size - self.chunk_overlap)

    for i in range(0, len(content), step):
        chunk_text = content[i:i + self.chunk_size]

        # Skip windows that contain only whitespace.
        if not chunk_text.strip():
            continue

        chunk = Chunk(
            id=self._generate_chunk_id(document_id, len(chunks)),
            document_id=document_id,
            content=chunk_text.strip(),
            chunk_index=len(chunks),
            start_pos=i,
            end_pos=min(i + self.chunk_size, len(content)),
            metadata={
                "chunk_method": "fixed",
                "original_length": len(chunk_text)
            }
        )
        chunks.append(chunk)

    return chunks
269
+
270
def _generate_chunk_id(self, document_id: str, chunk_index: int) -> str:
    """Build the deterministic chunk identifier '<document_id>_chunk_<index>'."""
    return "{0}_chunk_{1}".format(document_id, chunk_index)
273
+
274
def optimize_chunks_for_embedding(self, chunks: List[Chunk]) -> List[Chunk]:
    """Return embedding-ready copies of *chunks*.

    Each chunk's content is cleaned via the preprocessor; chunks whose
    cleaned text has fewer than 5 words are dropped. Original position and
    metadata are preserved, with bookkeeping fields added.
    """
    optimized: List[Chunk] = []

    for original in chunks:
        # Aggressively clean the text for the embedding model.
        cleaned = self.preprocessor.prepare_for_embedding(original.content)

        # Discard fragments too short to embed meaningfully.
        if len(cleaned.split()) < 5:
            continue

        optimized.append(Chunk(
            id=original.id,
            document_id=original.document_id,
            content=cleaned,
            chunk_index=original.chunk_index,
            start_pos=original.start_pos,
            end_pos=original.end_pos,
            metadata={
                **original.metadata,
                "optimized_for_embedding": True,
                "original_content_length": len(original.content),
                "optimized_content_length": len(cleaned)
            }
        ))

    return optimized
core/document_parser.py ADDED
@@ -0,0 +1,199 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import tempfile
3
+ import os
4
+ from pathlib import Path
5
+ from typing import Optional, Dict, Any
6
+ import asyncio
7
+
8
+ # Document processing libraries
9
+ import PyPDF2
10
+ from docx import Document as DocxDocument
11
+ from PIL import Image
12
+ import pytesseract
13
+
14
+ from .models import Document, DocumentType
15
+ import config
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
class DocumentParser:
    """Parses uploaded files (PDF, TXT, DOCX, images) into Document objects.

    Each `_parse_*` helper extracts plain text from one format; images go
    through an optional OCR service, with a direct pytesseract fallback.
    """

    def __init__(self):
        self.config = config.config
        # Optional OCR backend, injected after construction by the caller.
        # Initialised explicitly so _parse_image no longer needs a fragile
        # hasattr() probe for an attribute this class never defined.
        self.ocr_service = None

    async def parse_document(self, file_path: str, filename: str) -> Document:
        """Parse a document and extract its content.

        Raises ValueError for unsupported extensions; re-raises parser
        errors after logging.
        """
        try:
            file_ext = Path(filename).suffix.lower()
            file_size = os.path.getsize(file_path)

            # Determine document type and parse accordingly.
            if file_ext == '.pdf':
                content = await self._parse_pdf(file_path)
                doc_type = DocumentType.PDF
            elif file_ext == '.txt':
                content = await self._parse_text(file_path)
                doc_type = DocumentType.TEXT
            elif file_ext == '.docx':
                content = await self._parse_docx(file_path)
                doc_type = DocumentType.DOCX
            elif file_ext in ['.png', '.jpg', '.jpeg', '.bmp', '.tiff']:
                content = await self._parse_image(file_path)
                doc_type = DocumentType.IMAGE
            else:
                raise ValueError(f"Unsupported file type: {file_ext}")

            document = Document(
                id=self._generate_document_id(),
                filename=filename,
                content=content,
                doc_type=doc_type,
                file_size=file_size,
                metadata={
                    "file_extension": file_ext,
                    "content_length": len(content),
                    "word_count": len(content.split()) if content else 0
                }
            )

            # NOTE(review): the dump showed these log messages scrubbed to
            # "(unknown)"; restored to the filename — confirm against original.
            logger.info(f"Successfully parsed document: {filename}")
            return document

        except Exception as e:
            logger.error(f"Error parsing document {filename}: {str(e)}")
            raise

    async def _parse_pdf(self, file_path: str) -> str:
        """Extract text from a PDF, page by page; bad pages are skipped."""
        try:
            content = ""
            with open(file_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                for page_num, page in enumerate(pdf_reader.pages):
                    try:
                        page_text = page.extract_text()
                        if page_text.strip():
                            # Keep a visible page marker for downstream chunking.
                            content += f"\n--- Page {page_num + 1} ---\n"
                            content += page_text + "\n"
                    except Exception as e:
                        logger.warning(f"Error extracting text from page {page_num + 1}: {str(e)}")
                        continue

            return content.strip()
        except Exception as e:
            logger.error(f"Error parsing PDF: {str(e)}")
            raise

    async def _parse_text(self, file_path: str) -> str:
        """Read a plain-text file as UTF-8, ignoring undecodable bytes."""
        try:
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
                content = file.read()
            return content.strip()
        except Exception as e:
            logger.error(f"Error parsing text file: {str(e)}")
            raise

    async def _parse_docx(self, file_path: str) -> str:
        """Extract text from a DOCX file: paragraphs first, then table cells."""
        try:
            doc = DocxDocument(file_path)
            content = ""

            for paragraph in doc.paragraphs:
                if paragraph.text.strip():
                    content += paragraph.text + "\n"

            # Flatten tables into pipe-separated rows.
            for table in doc.tables:
                for row in table.rows:
                    row_text = []
                    for cell in row.cells:
                        if cell.text.strip():
                            row_text.append(cell.text.strip())
                    if row_text:
                        content += " | ".join(row_text) + "\n"

            return content.strip()
        except Exception as e:
            logger.error(f"Error parsing DOCX file: {str(e)}")
            raise

    async def _parse_image(self, file_path: str) -> str:
        """Extract text from an image via OCR.

        Prefers the injected OCR service; falls back to local pytesseract.
        Returns "" (instead of raising) when OCR fails entirely.
        """
        try:
            # Use the injected OCR service when one was provided.
            if self.ocr_service:
                logger.info(f"Using OCR service for image: {file_path}")
                text = await self.ocr_service.extract_text_from_image(file_path)
                if text:
                    return text

            # Fallback to direct pytesseract.
            logger.info(f"Using direct pytesseract for image: {file_path}")
            image = Image.open(file_path)

            content = pytesseract.image_to_string(
                image,
                lang=self.config.OCR_LANGUAGE,
                config='--psm 6'  # Assume a single uniform block of text
            )

            return content.strip()
        except Exception as e:
            logger.error(f"Error performing OCR on image: {str(e)}")
            # Deliberate best-effort: an unreadable image yields no text.
            return ""

    def _generate_document_id(self) -> str:
        """Generate a unique document ID (UUID4 string)."""
        import uuid
        return str(uuid.uuid4())

    async def extract_metadata(self, file_path: str, content: str) -> Dict[str, Any]:
        """Extract additional metadata (stats, file times, language, reading
        time). Returns {} on failure rather than raising."""
        try:
            metadata = {}

            # Basic statistics.
            metadata["content_length"] = len(content)
            metadata["word_count"] = len(content.split()) if content else 0
            metadata["line_count"] = len(content.splitlines()) if content else 0

            # File information.
            file_stat = os.stat(file_path)
            metadata["file_size"] = file_stat.st_size
            metadata["created_time"] = file_stat.st_ctime
            metadata["modified_time"] = file_stat.st_mtime

            # Content analysis.
            if content:
                metadata["estimated_language"] = self._detect_language(content)
                # Reading-time estimate at ~200 words per minute, min 1 minute.
                metadata["estimated_reading_time_minutes"] = max(1, metadata["word_count"] // 200)

            return metadata
        except Exception as e:
            logger.error(f"Error extracting metadata: {str(e)}")
            return {}

    def _detect_language(self, content: str) -> str:
        """Very rough language detection: 'en' if >10% of words are common
        English function words, else 'unknown'. Not a real detector."""
        if not content:
            return "unknown"

        english_words = ["the", "and", "or", "but", "in", "on", "at", "to", "for", "of", "with", "by", "from", "as", "is", "was", "are", "were", "be", "been", "have", "has", "had", "do", "does", "did", "will", "would", "could", "should", "may", "might", "can", "this", "that", "these", "those"]

        words = content.lower().split()
        english_count = sum(1 for word in words if word in english_words)

        if len(words) > 0 and english_count / len(words) > 0.1:
            return "en"
        else:
            return "unknown"
+ return "unknown"
core/models.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydantic import BaseModel, Field
2
+ from typing import List, Optional, Dict, Any
3
+ from datetime import datetime
4
+ from enum import Enum
5
+
6
class DocumentType(str, Enum):
    """Supported source-document formats; values double as file-type labels."""

    PDF = "pdf"
    TEXT = "txt"
    DOCX = "docx"
    IMAGE = "image"
    HTML = "html"
12
+
13
class ProcessingStatus(str, Enum):
    """Lifecycle states of an asynchronous document-processing task."""

    PENDING = "pending"        # queued, not started
    PROCESSING = "processing"  # currently running
    COMPLETED = "completed"    # finished successfully
    FAILED = "failed"          # finished with an error
18
+
19
class Document(BaseModel):
    """A parsed source file with its extracted text and derived metadata."""
    id: str = Field(..., description="Unique document identifier")
    filename: str = Field(..., description="Original filename")
    content: str = Field(..., description="Extracted text content")
    doc_type: DocumentType = Field(..., description="Document type")
    file_size: int = Field(..., description="File size in bytes")
    # NOTE(review): datetime.utcnow is naive and deprecated in Python 3.12 —
    # consider datetime.now(timezone.utc) (changes values to tz-aware).
    created_at: datetime = Field(default_factory=datetime.utcnow)
    metadata: Dict[str, Any] = Field(default_factory=dict)
    tags: List[str] = Field(default_factory=list)
    # The remaining fields are filled in later by enrichment steps.
    summary: Optional[str] = None
    category: Optional[str] = None
    language: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        """Serialize for API responses; content is truncated to 500 chars."""
        return {
            "id": self.id,
            "filename": self.filename,
            # Truncate long content so listings stay small.
            "content": self.content[:500] + "..." if len(self.content) > 500 else self.content,
            "doc_type": self.doc_type,
            "file_size": self.file_size,
            "created_at": self.created_at.isoformat(),
            "metadata": self.metadata,
            "tags": self.tags,
            "summary": self.summary,
            "category": self.category,
            "language": self.language
        }
46
+
47
class Chunk(BaseModel):
    """A contiguous slice of a Document's text, optionally with its embedding."""
    id: str = Field(..., description="Unique chunk identifier")
    document_id: str = Field(..., description="Parent document ID")
    content: str = Field(..., description="Chunk text content")
    chunk_index: int = Field(..., description="Position in document")
    start_pos: int = Field(..., description="Start position in original document")
    end_pos: int = Field(..., description="End position in original document")
    # Populated lazily once the embedding service has processed the chunk.
    embedding: Optional[List[float]] = None
    metadata: Dict[str, Any] = Field(default_factory=dict)
56
+
57
class SearchResult(BaseModel):
    """One hit from a semantic search: a chunk plus its similarity score."""
    chunk_id: str = Field(..., description="Matching chunk ID")
    document_id: str = Field(..., description="Source document ID")
    content: str = Field(..., description="Matching content")
    score: float = Field(..., description="Similarity score")
    metadata: Dict[str, Any] = Field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        """Plain-dict form for JSON tool responses."""
        return {
            "chunk_id": self.chunk_id,
            "document_id": self.document_id,
            "content": self.content,
            "score": self.score,
            "metadata": self.metadata
        }
72
+
73
class ProcessingTask(BaseModel):
    """Tracks the lifecycle of an asynchronous document-processing job."""
    task_id: str = Field(..., description="Unique task identifier")
    document_id: Optional[str] = None  # set once the document exists
    status: ProcessingStatus = ProcessingStatus.PENDING
    # Completion percentage in [0.0, 100.0].
    progress: float = Field(default=0.0, ge=0.0, le=100.0)
    message: Optional[str] = None  # human-readable status line
    error: Optional[str] = None    # populated when status == FAILED
    created_at: datetime = Field(default_factory=datetime.utcnow)
    # NOTE(review): updated_at is only set at creation; callers presumably
    # refresh it on every status change — confirm.
    updated_at: datetime = Field(default_factory=datetime.utcnow)
82
+
83
class SummaryRequest(BaseModel):
    """Summarization request: supply raw content OR a document_id."""
    content: Optional[str] = None
    document_id: Optional[str] = None
    style: str = Field(default="concise", description="Summary style")
    max_length: Optional[int] = None  # optional cap on summary length
88
+
89
class TagGenerationRequest(BaseModel):
    """Tag-generation request: supply raw content OR a document_id."""
    content: Optional[str] = None
    document_id: Optional[str] = None
    # Number of tags to produce, bounded 1..20.
    max_tags: int = Field(default=5, ge=1, le=20)
93
+
94
class QuestionAnswerRequest(BaseModel):
    """RAG question-answering request over indexed content."""
    question: str = Field(..., description="Question to answer")
    # Optional metadata filter to narrow the retrieval context.
    context_filter: Optional[Dict[str, Any]] = None
    # Maximum characters of retrieved context to feed the LLM.
    max_context_length: int = Field(default=2000)
98
+
99
class CategorizationRequest(BaseModel):
    """Categorization request: supply raw content OR a document_id;
    `categories` optionally restricts the candidate labels."""
    content: Optional[str] = None
    document_id: Optional[str] = None
    categories: Optional[List[str]] = None
core/text_preprocessor.py ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import logging
3
+ from typing import List, Optional
4
+ import unicodedata
5
+
6
+ logger = logging.getLogger(__name__)
7
+
8
class TextPreprocessor:
    """Text cleaning / analysis helpers used by the chunking pipeline.

    All methods are defensive: on unexpected errors they log and return the
    input unchanged (or an empty result) instead of raising.

    Improvements: keyword counting now uses collections.Counter (same
    ordering: frequency desc, ties in first-seen order), and repeated
    `text.split()` calls in metadata extraction are hoisted.
    """

    def __init__(self):
        # Minimal built-in stop-word sets keyed by ISO language code.
        self.stop_words = {
            'en': set([
                'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
                'of', 'with', 'by', 'from', 'up', 'about', 'into', 'through', 'during',
                'before', 'after', 'above', 'below', 'between', 'among', 'throughout',
                'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had',
                'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may', 'might',
                'must', 'shall', 'can', 'this', 'that', 'these', 'those', 'i', 'me',
                'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours'
            ])
        }

    def clean_text(self, text: str, aggressive: bool = False) -> str:
        """Clean and normalize text.

        With aggressive=True the character set and punctuation are reduced
        further (intended for embedding input); otherwise readability is
        preserved.
        """
        if not text:
            return ""

        try:
            # Normalize unicode characters (compatibility decomposition).
            text = unicodedata.normalize('NFKD', text)

            # Collapse runs of whitespace.
            text = re.sub(r'\s+', ' ', text)

            if aggressive:
                # More aggressive cleaning for embedding.
                text = re.sub(r'[^\w\s\-.,!?;:]', ' ', text)
                text = re.sub(r'[.,!?;:]+', '.', text)
            else:
                # Basic cleaning for readability.
                text = re.sub(r'[^\w\s\-.,!?;:()\[\]{}"\']', ' ', text)

            # Collapse excessive punctuation.
            text = re.sub(r'\.{2,}', '.', text)
            text = re.sub(r'[!?]{2,}', '!', text)

            # Final whitespace normalization.
            text = re.sub(r'\s+', ' ', text)
            return text.strip()
        except Exception as e:
            logger.error(f"Error cleaning text: {str(e)}")
            return text

    def extract_sentences(self, text: str) -> List[str]:
        """Split text into sentences; fragments of <= 10 chars are dropped."""
        if not text:
            return []

        try:
            # Simple sentence splitting on terminal punctuation.
            sentences = re.split(r'[.!?]+', text)

            clean_sentences = []
            for sentence in sentences:
                sentence = sentence.strip()
                if len(sentence) > 10:  # Minimum sentence length
                    clean_sentences.append(sentence)

            return clean_sentences
        except Exception as e:
            logger.error(f"Error extracting sentences: {str(e)}")
            return [text]

    def extract_keywords(self, text: str, language: str = 'en', max_keywords: int = 20) -> List[str]:
        """Return up to max_keywords frequent non-stop-words (>= 3 letters)."""
        if not text:
            return []

        try:
            from collections import Counter  # local import: module deps unchanged

            # Lowercase alphabetic tokens of length >= 3.
            words = re.findall(r'\b[a-zA-Z]{3,}\b', text.lower())

            stop_words = self.stop_words.get(language, set())
            # Counter.most_common orders by count desc with ties in
            # first-seen order — identical to the previous dict + stable sort.
            counts = Counter(word for word in words if word not in stop_words)

            return [word for word, _count in counts.most_common(max_keywords)]
        except Exception as e:
            logger.error(f"Error extracting keywords: {str(e)}")
            return []

    def prepare_for_embedding(self, text: str) -> str:
        """Prepare text specifically for embedding generation."""
        if not text:
            return ""

        try:
            # Clean text aggressively for better embeddings.
            clean_text = self.clean_text(text, aggressive=True)

            # Remove very short words.
            words = clean_text.split()
            filtered_words = [word for word in words if len(word) >= 2]

            result = ' '.join(filtered_words)

            # Truncate if too long (most embedding models have token limits).
            if len(result) > 5000:  # Rough character limit
                result = result[:5000] + "..."

            return result
        except Exception as e:
            logger.error(f"Error preparing text for embedding: {str(e)}")
            return text

    def extract_metadata_from_text(self, text: str) -> dict:
        """Compute descriptive statistics and content flags for *text*."""
        if not text:
            return {}

        try:
            metadata = {}
            words = text.split()  # hoisted: previously recomputed per metric

            # Basic statistics.
            metadata['character_count'] = len(text)
            metadata['word_count'] = len(words)
            metadata['sentence_count'] = len(self.extract_sentences(text))
            metadata['paragraph_count'] = len([p for p in text.split('\n\n') if p.strip()])

            # Content characteristics.
            metadata['avg_word_length'] = sum(len(word) for word in words) / max(1, len(words))
            metadata['avg_sentence_length'] = metadata['word_count'] / max(1, metadata['sentence_count'])

            # Special content detection (heuristic regexes).
            metadata['has_urls'] = bool(re.search(r'https?://\S+', text))
            metadata['has_emails'] = bool(re.search(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', text))
            metadata['has_phone_numbers'] = bool(re.search(r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b', text))
            metadata['has_dates'] = bool(re.search(r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b', text))
            metadata['has_numbers'] = bool(re.search(r'\b\d+\b', text))

            # Language indicators.
            metadata['punctuation_density'] = len(re.findall(r'[.,!?;:]', text)) / max(1, len(text))
            metadata['caps_ratio'] = len(re.findall(r'[A-Z]', text)) / max(1, len(text))

            return metadata
        except Exception as e:
            logger.error(f"Error extracting text metadata: {str(e)}")
            return {}

    def normalize_for_search(self, text: str) -> str:
        """Normalize text for search queries: lowercase, strip punctuation,
        collapse whitespace."""
        if not text:
            return ""

        try:
            text = text.lower()
            # Remove special characters but keep spaces.
            text = re.sub(r'[^\w\s]', ' ', text)
            # Normalize whitespace.
            text = re.sub(r'\s+', ' ', text)
            return text.strip()
        except Exception as e:
            logger.error(f"Error normalizing text for search: {str(e)}")
            return text
+ return text
mcp_server.py ADDED
@@ -0,0 +1,324 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import logging
3
+ from typing import Dict, Any, List, Optional
4
+ from pathlib import Path
5
+
6
+ from mcp.server.fastmcp import FastMCP
7
+
8
+ from services.vector_store_service import VectorStoreService
9
+ from services.document_store_service import DocumentStoreService
10
+ from services.embedding_service import EmbeddingService
11
+ from services.llm_service import LLMService
12
+ from services.ocr_service import OCRService
13
+
14
+ from mcp_tools.ingestion_tool import IngestionTool
15
+ from mcp_tools.search_tool import SearchTool
16
+ from mcp_tools.generative_tool import GenerativeTool
17
+
18
+ # Phase 2 & 3: Voice and Podcast
19
+ from services.llamaindex_service import LlamaIndexService
20
+ from services.elevenlabs_service import ElevenLabsService
21
+ from services.podcast_generator_service import PodcastGeneratorService
22
+ from mcp_tools.voice_tool import VoiceTool
23
+ from mcp_tools.podcast_tool import PodcastTool
24
+
25
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# --- Module-level wiring -----------------------------------------------------
# These singletons are created at import time and shared by every @mcp.tool
# function below; their names are part of this module's interface.
logger.info("Initializing services for FastMCP...")
vector_store_service = VectorStoreService()
document_store_service = DocumentStoreService()
embedding_service_instance = EmbeddingService()
llm_service_instance = LLMService()
ocr_service_instance = OCRService()

# Ingestion/search/generation tools compose the core services above.
ingestion_tool_instance = IngestionTool(
    vector_store=vector_store_service,
    document_store=document_store_service,
    embedding_service=embedding_service_instance,
    ocr_service=ocr_service_instance
)
search_tool_instance = SearchTool(
    vector_store=vector_store_service,
    embedding_service=embedding_service_instance,
    document_store=document_store_service
)
generative_tool_instance = GenerativeTool(
    llm_service=llm_service_instance,
    search_tool=search_tool_instance
)

# Phase 2 & 3 Services: voice Q&A and podcast generation.
logger.info("Initializing Phase 2 & 3 services...")
llamaindex_service_instance = LlamaIndexService(document_store_service)
elevenlabs_service_instance = ElevenLabsService(llamaindex_service_instance)
podcast_generator_instance = PodcastGeneratorService(
    llamaindex_service=llamaindex_service_instance,
    llm_service=llm_service_instance
)

voice_tool_instance = VoiceTool(elevenlabs_service_instance)
podcast_tool_instance = PodcastTool(podcast_generator_instance)

# NOTE(review): the server is registered with an empty name — presumably a
# scrubbed placeholder in this dump; confirm the intended FastMCP server name.
mcp = FastMCP("")
logger.info("FastMCP server initialized.")
65
+
66
@mcp.tool()
async def ingest_document(file_path: str, file_type: Optional[str] = None) -> Dict[str, Any]:
    """
    Process and index a document from a local file path for searching.
    Automatically determines file_type if not provided.
    """
    logger.info(f"Tool 'ingest_document' called with file_path: {file_path}, file_type: {file_type}")
    try:
        # Fall back to the filename extension when no type was supplied.
        resolved_type = file_type or Path(file_path).suffix.lower().strip('.')
        if not file_type:
            logger.info(f"Inferred file_type: {resolved_type}")
        outcome = await ingestion_tool_instance.process_document(file_path, resolved_type)
        logger.info(f"Ingestion result: {outcome}")
        return outcome
    except Exception as e:
        logger.error(f"Error in 'ingest_document' tool: {str(e)}", exc_info=True)
        return {"success": False, "error": str(e)}
84
+
85
@mcp.tool()
async def semantic_search(query: str, top_k: int = 5, filters: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
    """
    Search through indexed content using natural language.
    'filters' can be used to narrow down the search.
    """
    logger.info(f"Tool 'semantic_search' called with query: {query}, top_k: {top_k}, filters: {filters}")
    try:
        hits = await search_tool_instance.search(query, top_k, filters)
        payload = {
            "success": True,
            "query": query,
            "results": [hit.to_dict() for hit in hits],
            "total_results": len(hits),
        }
        return payload
    except Exception as e:
        logger.error(f"Error in 'semantic_search' tool: {str(e)}", exc_info=True)
        return {"success": False, "error": str(e), "results": []}
103
+
104
@mcp.tool()
async def summarize_content(
    content: Optional[str] = None,
    document_id: Optional[str] = None,
    style: str = "concise"
) -> Dict[str, Any]:
    """
    Generate a summary of provided content or a document_id.
    Available styles: concise, detailed, bullet_points, executive.

    BUGFIX: 'original_length' now reports the length of the input text
    before truncation; previously it reported the truncated length, which
    made the field misleading for long documents.
    """
    logger.info(f"Tool 'summarize_content' called. doc_id: {document_id}, style: {style}, has_content: {content is not None}")
    try:
        # Resolve the text: explicit content wins over document lookup.
        text_to_summarize = content
        if document_id and not text_to_summarize:
            doc = await document_store_service.get_document(document_id)
            if not doc:
                return {"success": False, "error": f"Document {document_id} not found"}
            text_to_summarize = doc.content
        if not text_to_summarize:
            return {"success": False, "error": "No content provided for summarization"}

        # Record the true input length before any truncation.
        original_length = len(text_to_summarize)

        # Cap the LLM input to keep the prompt within model limits.
        max_length = 10000
        if original_length > max_length:
            logger.warning(f"Content for summarization is long ({original_length} chars), truncating to {max_length}")
            text_to_summarize = text_to_summarize[:max_length] + "..."

        summary = await generative_tool_instance.summarize(text_to_summarize, style)
        return {
            "success": True,
            "summary": summary,
            "original_length": original_length,
            "summary_length": len(summary),
            "style": style
        }
    except Exception as e:
        logger.error(f"Error in 'summarize_content' tool: {str(e)}", exc_info=True)
        return {"success": False, "error": str(e)}
139
+
140
@mcp.tool()
async def generate_tags(
    content: Optional[str] = None,
    document_id: Optional[str] = None,
    max_tags: int = 5
) -> Dict[str, Any]:
    """
    Generate relevant tags for content or a document_id.
    Saves tags to document metadata if document_id is provided.
    """
    logger.info(f"Tool 'generate_tags' called. doc_id: {document_id}, max_tags: {max_tags}, has_content: {content is not None}")
    try:
        # Resolve the text: explicit content wins over document lookup.
        source_text = content
        if document_id and not source_text:
            doc = await document_store_service.get_document(document_id)
            if not doc:
                return {"success": False, "error": f"Document {document_id} not found"}
            source_text = doc.content
        if not source_text:
            return {"success": False, "error": "No content provided for tag generation"}

        tags = await generative_tool_instance.generate_tags(source_text, max_tags)

        # Persist tags on the source document when one was specified.
        if document_id and tags:
            await document_store_service.update_document_metadata(document_id, {"tags": tags})
            logger.info(f"Tags {tags} saved for document {document_id}")

        return {
            "success": True,
            "tags": tags,
            "content_length": len(source_text),
            "document_id": document_id,
        }
    except Exception as e:
        logger.error(f"Error in 'generate_tags' tool: {str(e)}", exc_info=True)
        return {"success": False, "error": str(e)}
173
+
174
@mcp.tool()
async def answer_question(question: str, context_filter: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
    """
    Answer questions using RAG (Retrieval Augmented Generation) over indexed content.
    'context_filter' can be used to narrow down the context search.
    """
    logger.info(f"Tool 'answer_question' called with question: {question}, context_filter: {context_filter}")
    try:
        context_hits = await search_tool_instance.search(question, top_k=5, filters=context_filter)
        if not context_hits:
            # No retrievable context: report failure with a user-facing answer.
            return {
                "success": False,
                "error": "No relevant context found. Please upload relevant documents.",
                "question": question,
                "answer": "I could not find enough information in the documents to answer your question."
            }

        answer = await generative_tool_instance.answer_question(question, context_hits)
        confidence = "high" if len(context_hits) >= 3 else "medium"
        return {
            "success": True,
            "question": question,
            "answer": answer,
            "sources": [hit.to_dict() for hit in context_hits],
            "confidence": confidence,
        }
    except Exception as e:
        logger.error(f"Error in 'answer_question' tool: {str(e)}", exc_info=True)
        return {"success": False, "error": str(e)}
201
+
202
@mcp.tool()
async def voice_qa(question: str, session_id: Optional[str] = None) -> Dict[str, Any]:
    """
    Ask a question using the AI voice assistant with RAG capabilities.
    Provides text-based Q&A powered by LlamaIndex agentic search.
    """
    logger.info(f"Tool 'voice_qa' called with question: {question}")
    try:
        # The voice tool returns a fully-formed response dict; pass it straight through.
        return await voice_tool_instance.voice_qa(question, session_id)
    except Exception as e:
        logger.error(f"Error in 'voice_qa' tool: {str(e)}", exc_info=True)
        return {"success": False, "error": str(e)}
215
+
216
@mcp.tool()
async def generate_podcast(
    document_ids: List[str],
    style: str = "conversational",
    duration_minutes: int = 10,
    host1_voice: str = "Rachel",
    host2_voice: str = "Adam"
) -> Dict[str, Any]:
    """
    Generate a podcast from selected documents.
    Styles: conversational, educational, technical, casual.
    Duration: 5-30 minutes recommended.
    Voices: Rachel, Adam, Domi, Bella, Antoni, Josh, Sam, Emily, etc.
    """
    logger.info(f"Tool 'generate_podcast' called with {len(document_ids)} docs, style: {style}")
    try:
        # FIX: host1_voice was accepted but never forwarded, so the first host
        # always fell back to the generator's default voice regardless of the
        # caller's choice. Both voices are now passed through.
        result = await podcast_tool_instance.generate_podcast(
            document_ids=document_ids,
            style=style,
            duration_minutes=duration_minutes,
            host1_voice=host1_voice,
            host2_voice=host2_voice
        )
        return result
    except Exception as e:
        logger.error(f"Error in 'generate_podcast' tool: {str(e)}", exc_info=True)
        return {"success": False, "error": str(e)}
242
+
243
@mcp.tool()
async def generate_podcast_transcript(
    document_ids: List[str],
    style: str = "conversational",
    duration_minutes: int = 10
) -> Dict[str, Any]:
    """
    Generate a podcast script/transcript WITHOUT generating audio.
    Useful for previewing content before spending credits on audio generation.
    """
    logger.info(f"Tool 'generate_podcast_transcript' called")
    try:
        transcript_result = await podcast_tool_instance.generate_transcript(
            document_ids=document_ids,
            style=style,
            duration_minutes=duration_minutes
        )
        return transcript_result
    except Exception as e:
        logger.error(f"Error in 'generate_podcast_transcript': {str(e)}")
        return {"success": False, "error": str(e)}
263
+
264
@mcp.tool()
async def list_podcasts(limit: int = 10) -> Dict[str, Any]:
    """
    List previously generated podcasts with their metadata.
    """
    logger.info(f"Tool 'list_podcasts' called")
    try:
        # Note: the underlying call is synchronous; no await needed.
        result = podcast_tool_instance.list_podcasts(limit)
        return result
    except Exception as e:
        logger.error(f"Error in 'list_podcasts': {str(e)}")
        return {"success": False, "error": str(e)}
275
+
276
@mcp.tool()
async def get_podcast(podcast_id: str) -> Dict[str, Any]:
    """
    Get metadata for a specific podcast.
    """
    logger.info(f"Tool 'get_podcast' called for {podcast_id}")
    try:
        # Synchronous lookup on the podcast tool; result returned as-is.
        result = podcast_tool_instance.get_podcast(podcast_id)
        return result
    except Exception as e:
        logger.error(f"Error in 'get_podcast': {str(e)}")
        return {"success": False, "error": str(e)}
287
+
288
@mcp.tool()
async def get_podcast_audio(podcast_id: str) -> Dict[str, Any]:
    """
    Get the audio file path for a generated podcast.
    """
    logger.info(f"Tool 'get_podcast_audio' called for {podcast_id}")
    try:
        # Synchronous lookup on the podcast tool; result returned as-is.
        result = podcast_tool_instance.get_podcast_audio(podcast_id)
        return result
    except Exception as e:
        logger.error(f"Error in 'get_podcast_audio': {str(e)}")
        return {"success": False, "error": str(e)}
299
+
300
@mcp.tool()
async def list_documents_for_ui(limit: int = 100, offset: int = 0) -> Dict[str, Any]:
    """
    (UI Helper) List documents from the document store.
    Not a standard processing tool, but useful for UI population.
    """
    logger.info(f"Tool 'list_documents_for_ui' called with limit: {limit}, offset: {offset}")
    try:
        docs = await document_store_service.list_documents(limit, offset)
        serialized = [doc.to_dict() for doc in docs]
        return {
            "success": True,
            "documents": serialized,
            "total": len(serialized)
        }
    except Exception as e:
        logger.error(f"Error in 'list_documents_for_ui' tool: {str(e)}", exc_info=True)
        return {"success": False, "error": str(e), "documents": []}
317
+
318
+ # NOTE: FastMCP server startup disabled to avoid conflict with Gradio MCP
319
+ # Gradio MCP (in app.py) will handle MCP tool exposure via /gradio_api/mcp/sse
320
+ # If you need FastMCP separately, run this file directly with different port
321
+ # if __name__ == "__main__":
322
+ # logger.info("Starting FastMCP server...")
323
+ # asyncio.run(mcp.run(transport="sse", host="0.0.0.0", port=7860))
324
+
mcp_tools/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # MCP tools module initialization
mcp_tools/generative_tool.py ADDED
@@ -0,0 +1,407 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from typing import List, Dict, Any, Optional
3
+ import asyncio
4
+
5
+ from services.llm_service import LLMService
6
+ from mcp_tools.search_tool import SearchTool
7
+ from core.models import SearchResult
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
class GenerativeTool:
    """LLM-backed content-generation helpers.

    Wraps an LLMService for summarization, tagging, categorization, Q&A,
    outlines, explanations, comparisons, question generation, paraphrasing
    and insight extraction. An optional SearchTool supplies RAG context
    when ``answer_question`` is called without explicit context.

    All public methods swallow exceptions and return an error string or an
    empty collection instead of raising, so MCP callers always get a value.
    """

    def __init__(self, llm_service: LLMService, search_tool: Optional[SearchTool] = None):
        self.llm_service = llm_service
        self.search_tool = search_tool

    async def summarize(self, content: str, style: str = "concise", max_length: Optional[int] = None) -> str:
        """Generate a summary of the given content.

        Args:
            content: Text to summarize; blank input short-circuits.
            style: Summary style forwarded to the LLM service.
            max_length: Optional length cap forwarded to the LLM service.

        Returns:
            The summary, or an explanatory error string on failure.
        """
        try:
            if not content.strip():
                return "No content provided for summarization."

            logger.info(f"Generating {style} summary for content of length {len(content)}")
            summary = await self.llm_service.summarize(content, style, max_length)
            logger.info(f"Generated summary of length {len(summary)}")
            return summary

        except Exception as e:
            logger.error(f"Error generating summary: {str(e)}")
            return f"Error generating summary: {str(e)}"

    async def generate_tags(self, content: str, max_tags: int = 5) -> List[str]:
        """Generate up to ``max_tags`` relevant tags for the given content.

        Returns an empty list for blank input or on any error.
        """
        try:
            if not content.strip():
                return []

            logger.info(f"Generating up to {max_tags} tags for content")
            tags = await self.llm_service.generate_tags(content, max_tags)
            logger.info(f"Generated {len(tags)} tags")
            return tags

        except Exception as e:
            logger.error(f"Error generating tags: {str(e)}")
            return []

    async def categorize(self, content: str, categories: List[str]) -> str:
        """Categorize content into one of the provided categories.

        Falls back to a default category list when none is given, and to
        "Uncategorized" for blank input or on error.
        """
        try:
            if not content.strip():
                return "Uncategorized"

            if not categories:
                # Default taxonomy used when the caller supplies no categories.
                categories = ["Technology", "Business", "Science", "Education", "Entertainment", "News", "Research", "Other"]

            logger.info(f"Categorizing content into one of {len(categories)} categories")
            category = await self.llm_service.categorize(content, categories)
            logger.info(f"Categorized as: {category}")
            return category

        except Exception as e:
            logger.error(f"Error categorizing content: {str(e)}")
            return "Uncategorized"

    async def answer_question(self, question: str, context_results: Optional[List[SearchResult]] = None) -> str:
        """Answer a question using the provided context or RAG.

        If no context is supplied and a search tool is configured, relevant
        chunks are retrieved first. With no context at all, the LLM answers
        from an empty context string.
        """
        try:
            if not question.strip():
                return "No question provided."

            logger.info(f"Answering question: {question[:100]}...")

            # If no context provided and search tool is available, search for relevant context
            if not context_results and self.search_tool:
                logger.info("No context provided, searching for relevant information")
                context_results = await self.search_tool.search(question, top_k=5)

            # Prepare context from search results
            if context_results:
                context_texts = []
                for result in context_results:
                    context_texts.append(f"Source: {result.document_id}\nContent: {result.content}\n")

                context = "\n---\n".join(context_texts)
                logger.info(f"Using context from {len(context_results)} sources")
            else:
                context = ""
                logger.info("No context available for answering question")

            answer = await self.llm_service.answer_question(question, context)
            logger.info(f"Generated answer of length {len(answer)}")
            return answer

        except Exception as e:
            logger.error(f"Error answering question: {str(e)}")
            return f"I encountered an error while trying to answer your question: {str(e)}"

    async def generate_outline(self, topic: str, num_sections: int = 5, detail_level: str = "medium") -> str:
        """Generate an outline for the given topic.

        ``detail_level`` may be "brief", "medium" or "detailed"; unknown
        values fall back to detailed bullet points.
        """
        try:
            if not topic.strip():
                return "No topic provided."

            detail_descriptions = {
                "brief": "brief bullet points",
                "medium": "detailed bullet points with descriptions",
                "detailed": "comprehensive outline with sub-sections and explanations"
            }
            detail_desc = detail_descriptions.get(detail_level, "detailed bullet points")

            prompt = f"""Create a {detail_desc} outline for the topic: "{topic}"

The outline should have {num_sections} main sections and be well-structured and informative.

Format the outline clearly with proper numbering and indentation.

Topic: {topic}

Outline:"""

            outline = await self.llm_service.generate_text(prompt, max_tokens=800, temperature=0.7)
            logger.info(f"Generated outline for topic: {topic}")
            return outline

        except Exception as e:
            logger.error(f"Error generating outline: {str(e)}")
            return f"Error generating outline: {str(e)}"

    async def explain_concept(self, concept: str, audience: str = "general", length: str = "medium") -> str:
        """Explain a concept for a specific audience at a chosen depth.

        ``audience``: general | technical | beginner | expert.
        ``length``: brief | medium | detailed.
        Unknown values fall back to general-audience / comprehensive.
        """
        try:
            if not concept.strip():
                return "No concept provided."

            audience_styles = {
                "general": "a general audience using simple, clear language",
                "technical": "a technical audience with appropriate jargon and detail",
                "beginner": "beginners with no prior knowledge, using analogies and examples",
                "expert": "experts in the field with advanced terminology and depth"
            }
            length_guidance = {
                "brief": "Keep the explanation concise and to the point (2-3 paragraphs).",
                "medium": "Provide a comprehensive explanation (4-6 paragraphs).",
                "detailed": "Give a thorough, in-depth explanation with examples."
            }

            audience_desc = audience_styles.get(audience, "a general audience")
            length_desc = length_guidance.get(length, "Provide a comprehensive explanation.")

            prompt = f"""Explain the concept of "{concept}" for {audience_desc}.

{length_desc}

Make sure to:
- Use appropriate language for the audience
- Include relevant examples or analogies
- Structure the explanation logically
- Ensure clarity and accuracy

Concept to explain: {concept}

Explanation:"""

            explanation = await self.llm_service.generate_text(prompt, max_tokens=600, temperature=0.5)
            logger.info(f"Generated explanation for concept: {concept}")
            return explanation

        except Exception as e:
            logger.error(f"Error explaining concept: {str(e)}")
            return f"Error explaining concept: {str(e)}"

    async def compare_concepts(self, concept1: str, concept2: str, aspects: Optional[List[str]] = None) -> str:
        """Compare two concepts across specified aspects.

        Uses a default aspect list when none is provided. Both concepts
        must be non-blank.
        """
        try:
            if not concept1.strip() or not concept2.strip():
                return "Both concepts must be provided for comparison."

            if not aspects:
                aspects = ["definition", "key features", "advantages", "disadvantages", "use cases"]

            aspects_str = ", ".join(aspects)

            # FIX: the original prompt repeated the "For each aspect, provide"
            # block twice; the duplicate has been removed.
            prompt = f"""Compare and contrast "{concept1}" and "{concept2}" across the following aspects: {aspects_str}.

Structure your comparison clearly, addressing each aspect for both concepts.

Format:
## Comparison: {concept1} vs {concept2}

For each aspect, provide:
- **{concept1}**: [description]
- **{concept2}**: [description]
- **Key Difference**: [summary]

Concepts to compare:
1. {concept1}
2. {concept2}

Comparison:"""

            comparison = await self.llm_service.generate_text(prompt, max_tokens=800, temperature=0.6)
            logger.info(f"Generated comparison between {concept1} and {concept2}")
            return comparison

        except Exception as e:
            logger.error(f"Error comparing concepts: {str(e)}")
            return f"Error comparing concepts: {str(e)}"

    async def generate_questions(self, content: str, question_type: str = "comprehension", num_questions: int = 5) -> List[str]:
        """Generate questions based on the provided content.

        ``question_type``: comprehension | analysis | application | creative
        | factual. Returns at most ``num_questions`` parsed questions; an
        empty list for blank input or on error.
        """
        try:
            if not content.strip():
                return []

            question_types = {
                "comprehension": "comprehension questions that test understanding of key concepts",
                "analysis": "analytical questions that require deeper thinking and evaluation",
                "application": "application questions that ask how to use the concepts in practice",
                "creative": "creative questions that encourage original thinking and exploration",
                "factual": "factual questions about specific details and information"
            }
            question_desc = question_types.get(question_type, "comprehension questions")

            # Content is truncated to 2000 chars to bound prompt size.
            # FIX: the truncation comment previously lived INSIDE the f-string
            # and was sent to the LLM as prompt text.
            prompt = f"""Based on the following content, generate {num_questions} {question_desc}.

The questions should be:
- Clear and well-formulated
- Relevant to the content
- Appropriate for the specified type
- Engaging and thought-provoking

Content:
{content[:2000]}

Questions:"""

            response = await self.llm_service.generate_text(prompt, max_tokens=400, temperature=0.7)

            # Parse numbered/bulleted lines containing a question mark.
            questions = []
            for line in response.split('\n'):
                line = line.strip()
                if line and ('?' in line or line.startswith(('1.', '2.', '3.', '4.', '5.', '-', '*'))):
                    question = line.lstrip('0123456789.-* ').strip()
                    if question and '?' in question:
                        questions.append(question)

            logger.info(f"Generated {len(questions)} {question_type} questions")
            return questions[:num_questions]

        except Exception as e:
            logger.error(f"Error generating questions: {str(e)}")
            return []

    def _chunk_text(self, text: str, chunk_size: int = 2000) -> List[str]:
        """Split text into chunks respecting paragraph boundaries.

        Splits on blank lines first; if an accumulated chunk still exceeds
        ``chunk_size``, it is further split on sentence boundaries. The
        remaining sentence tail is carried into the final chunk.
        """
        if len(text) <= chunk_size:
            return [text]

        chunks = []
        current_chunk = ""

        # Split by paragraphs first
        paragraphs = text.split('\n\n')

        for para in paragraphs:
            if len(current_chunk) + len(para) + 2 <= chunk_size:
                current_chunk += para + "\n\n"
            else:
                if current_chunk:
                    chunks.append(current_chunk.strip())
                current_chunk = para + "\n\n"

            # If a single paragraph is too long, split it by sentences
            if len(current_chunk) > chunk_size:
                long_para = current_chunk.strip()
                current_chunk = ""

                sentences = long_para.replace('. ', '.\n').split('\n')
                sub_chunk = ""
                for sentence in sentences:
                    if len(sub_chunk) + len(sentence) + 1 <= chunk_size:
                        sub_chunk += sentence + " "
                    else:
                        if sub_chunk:
                            chunks.append(sub_chunk.strip())
                        sub_chunk = sentence + " "
                if sub_chunk:
                    current_chunk = sub_chunk  # Carry over remaining part

        if current_chunk:
            chunks.append(current_chunk.strip())

        return chunks

    async def paraphrase_text(self, text: str, style: str = "formal", preserve_meaning: bool = True) -> str:
        """Paraphrase text in a different style while preserving meaning.

        Long inputs are split with ``_chunk_text`` and paraphrased chunk by
        chunk (recursively), with a short delay between chunks to respect
        LLM rate limits.
        """
        try:
            if not text.strip():
                return "No text provided for paraphrasing."

            # Check length and chunk if necessary
            MAX_CHUNK_SIZE = 2500
            if len(text) > MAX_CHUNK_SIZE:
                logger.info(f"Text length {len(text)} exceeds limit, chunking...")
                chunks = self._chunk_text(text, MAX_CHUNK_SIZE)
                logger.info(f"Split into {len(chunks)} chunks")

                paraphrased_chunks = []
                for i, chunk in enumerate(chunks):
                    logger.info(f"Processing chunk {i+1}/{len(chunks)}")
                    # Recursive call: each chunk is now under the size limit.
                    chunk_result = await self.paraphrase_text(chunk, style, preserve_meaning)
                    paraphrased_chunks.append(chunk_result)
                    # Small delay to be nice to rate limits
                    await asyncio.sleep(0.5)

                return "\n\n".join(paraphrased_chunks)

            style_instructions = {
                "formal": "formal, professional language",
                "casual": "casual, conversational language",
                "academic": "academic, scholarly language",
                "simple": "simple, easy-to-understand language",
                "technical": "technical, precise language"
            }

            style_desc = style_instructions.get(style, "clear, appropriate language")
            meaning_instruction = "while preserving the exact meaning and key information" if preserve_meaning else "while maintaining the general intent"

            prompt = f"""Paraphrase the following text using {style_desc} {meaning_instruction}.

Original text:
{text}

Paraphrased text:"""

            # Token budget scales with input word count to avoid truncation.
            paraphrase = await self.llm_service.generate_text(prompt, max_tokens=len(text.split()) * 2, temperature=0.6)
            logger.info(f"Paraphrased text in {style} style")
            return paraphrase.strip()

        except Exception as e:
            logger.error(f"Error paraphrasing text: {str(e)}")
            return f"Error paraphrasing text: {str(e)}"

    async def extract_key_insights(self, content: str, num_insights: int = 5) -> List[str]:
        """Extract key insights from the provided content.

        Returns at most ``num_insights`` parsed insight lines; an empty list
        for blank input or on error.
        """
        try:
            if not content.strip():
                return []

            # Content is truncated to 3000 chars to bound prompt size.
            # FIX: the truncation comment previously lived INSIDE the f-string
            # and was sent to the LLM as prompt text.
            prompt = f'''Analyze the following content and extract {num_insights} key insights or takeaways.

Each insight should be:
- A clear, concise statement
- Significant and meaningful
- Based on the content provided
- Actionable or thought-provoking when possible

Content:
{content[:3000]}

Key Insights:'''

            response = await self.llm_service.generate_text(prompt, max_tokens=400, temperature=0.6)

            # Parse insights from numbered/bulleted lines; the first parsed
            # line is accepted even without a bullet marker.
            insights = []
            for line in response.split('\n'):
                line = line.strip()
                if line and (line.startswith(('1.', '2.', '3.', '4.', '5.', '-', '*')) or len(insights) == 0):
                    insight = line.lstrip('0123456789.-* ').strip()
                    if insight and len(insight) > 10:  # Minimum insight length
                        insights.append(insight)

            logger.info(f"Extracted {len(insights)} key insights")
            return insights[:num_insights]

        except Exception as e:
            logger.error(f"Error extracting insights: {str(e)}")
            return []
mcp_tools/ingestion_tool.py ADDED
@@ -0,0 +1,368 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import asyncio
3
+ from typing import Dict, Any, Optional
4
+ import tempfile
5
+ import os
6
+ from pathlib import Path
7
+ import uuid
8
+
9
+ from core.document_parser import DocumentParser
10
+ from core.chunker import TextChunker
11
+ from core.text_preprocessor import TextPreprocessor
12
+ from services.vector_store_service import VectorStoreService
13
+ from services.document_store_service import DocumentStoreService
14
+ from services.embedding_service import EmbeddingService
15
+ from services.ocr_service import OCRService
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
class IngestionTool:
    """Full document-ingestion pipeline: parse -> store -> chunk -> embed -> index.

    Coordinates the document parser, text chunker, embedding service,
    vector store and document store. All public methods return a result
    dict with a ``success`` flag instead of raising.
    """

    def __init__(self, vector_store: VectorStoreService, document_store: DocumentStoreService,
                 embedding_service: EmbeddingService, ocr_service: OCRService):
        self.vector_store = vector_store
        self.document_store = document_store
        self.embedding_service = embedding_service
        self.ocr_service = ocr_service

        self.document_parser = DocumentParser()
        # Pass OCR service to document parser so image-based files can be read.
        self.document_parser.ocr_service = ocr_service

        self.text_chunker = TextChunker()
        self.text_preprocessor = TextPreprocessor()

    async def process_document(self, file_path: str, file_type: str, task_id: Optional[str] = None) -> Dict[str, Any]:
        """Process a document through the full ingestion pipeline.

        Args:
            file_path: Path to the file on disk.
            file_type: Declared file type (the parser infers from filename).
            task_id: Optional correlation id; generated when absent.

        Returns:
            Result dict with ``success``, ``task_id`` and, on success,
            document/chunk statistics.
        """
        if task_id is None:
            task_id = str(uuid.uuid4())

        try:
            logger.info(f"Starting document processing for {file_path}")

            # Step 1: Parse the document
            filename = Path(file_path).name
            document = await self.document_parser.parse_document(file_path, filename)

            if not document.content:
                logger.warning(f"No content extracted from document {filename}")
                return {
                    "success": False,
                    "error": "No content could be extracted from the document",
                    "task_id": task_id
                }

            # Step 2: Store the document
            await self.document_store.store_document(document)

            # Step 3: Process content for embeddings
            chunks = await self._create_and_embed_chunks(document)

            if not chunks:
                # FIX: this failure branch previously returned a dict that mixed
                # failure and success fields (including a "Successfully
                # processed" message); it now reports a clean failure.
                logger.warning(f"No chunks created for document {document.id}")
                return {
                    "success": False,
                    "error": "Failed to create text chunks",
                    "task_id": task_id,
                    "document_id": document.id,
                    "filename": document.filename
                }

            # Step 4: Store embeddings
            success = await self.vector_store.add_chunks(chunks)

            if not success:
                logger.error(f"Failed to store embeddings for document {document.id}")
                return {
                    "success": False,
                    "error": "Failed to store embeddings",
                    "task_id": task_id,
                    "document_id": document.id
                }

            # Step 5: Update document metadata with chunk count (best-effort).
            try:
                current_metadata = document.metadata or {}
                current_metadata["chunk_count"] = len(chunks)
                await self.document_store.update_document_metadata(
                    document.id,
                    {"metadata": current_metadata}
                )
            except Exception as e:
                logger.warning(f"Failed to update chunk count for document {document.id}: {e}")

            logger.info(f"Successfully processed document {document.id} with {len(chunks)} chunks")

            return {
                "success": True,
                "task_id": task_id,
                "document_id": document.id,
                "filename": document.filename,
                "chunks_created": len(chunks),
                "content_length": len(document.content),
                "doc_type": document.doc_type.value,
                "message": f"Successfully processed {document.filename}"
            }

        except Exception as e:
            logger.error(f"Error processing document {file_path}: {str(e)}")
            return {
                "success": False,
                "error": str(e),
                "task_id": task_id,
                "message": f"Failed to process document: {str(e)}"
            }

    async def _create_and_embed_chunks(self, document) -> list:
        """Chunk a document, embed the chunks, and return chunks with embeddings.

        Returns an empty list on any failure so callers can treat "no
        chunks" uniformly.
        """
        try:
            # Step 1: Create chunks
            chunks = self.text_chunker.chunk_document(
                document.id,
                document.content,
                method="recursive"
            )
            if not chunks:
                return []

            # Step 2: Optimize chunks for embedding
            optimized_chunks = self.text_chunker.optimize_chunks_for_embedding(chunks)

            # Step 3: Generate embeddings
            texts = [chunk.content for chunk in optimized_chunks]
            embeddings = await self.embedding_service.generate_embeddings(texts)

            # Step 4: Attach embeddings; drop any chunk without a vector.
            embedded_chunks = []
            for i, chunk in enumerate(optimized_chunks):
                if i < len(embeddings):
                    chunk.embedding = embeddings[i]
                    embedded_chunks.append(chunk)

            return embedded_chunks

        except Exception as e:
            logger.error(f"Error creating and embedding chunks: {str(e)}")
            return []

    async def process_url(self, url: str, task_id: Optional[str] = None) -> Dict[str, Any]:
        """Download a document from a URL and run it through the pipeline.

        The download is written to a temporary file which is always removed
        afterwards.
        """
        try:
            import requests
            from urllib.parse import urlparse

            # Download the file
            response = requests.get(url, timeout=30)
            response.raise_for_status()

            # Determine a filename from the URL path.
            parsed_url = urlparse(url)
            filename = Path(parsed_url.path).name or "downloaded_file"

            # Keep the original filename in the temp-file suffix so the
            # parser can infer the document type from its extension.
            with tempfile.NamedTemporaryFile(delete=False, suffix=f"_{filename}") as tmp_file:
                tmp_file.write(response.content)
                tmp_file_path = tmp_file.name

            try:
                result = await self.process_document(tmp_file_path, "", task_id)
                result["source_url"] = url
                return result
            finally:
                # Clean up temporary file
                if os.path.exists(tmp_file_path):
                    os.unlink(tmp_file_path)

        except Exception as e:
            logger.error(f"Error processing URL {url}: {str(e)}")
            return {
                "success": False,
                "error": str(e),
                "task_id": task_id or str(uuid.uuid4()),
                "source_url": url
            }

    async def process_text_content(self, content: str, filename: str = "text_content.txt",
                                   task_id: Optional[str] = None) -> Dict[str, Any]:
        """Ingest raw text directly, bypassing file parsing."""
        try:
            from core.models import Document, DocumentType
            from datetime import datetime

            # Create document object
            document = Document(
                id=str(uuid.uuid4()),
                filename=filename,
                content=content,
                doc_type=DocumentType.TEXT,
                file_size=len(content.encode('utf-8')),
                created_at=datetime.utcnow(),
                metadata={
                    "source": "direct_text_input",
                    "content_length": len(content),
                    "word_count": len(content.split())
                }
            )

            # Store the document
            await self.document_store.store_document(document)

            # Process content for embeddings
            chunks = await self._create_and_embed_chunks(document)

            if chunks:
                await self.vector_store.add_chunks(chunks)

                # Update document metadata with chunk count (best-effort).
                try:
                    current_metadata = document.metadata or {}
                    current_metadata["chunk_count"] = len(chunks)
                    await self.document_store.update_document_metadata(
                        document.id,
                        {"metadata": current_metadata}
                    )
                except Exception as e:
                    logger.warning(f"Failed to update chunk count for document {document.id}: {e}")

            return {
                "success": True,
                "task_id": task_id or str(uuid.uuid4()),
                "document_id": document.id,
                "filename": filename,
                "chunks_created": len(chunks),
                "content_length": len(content),
                "message": f"Successfully processed text content"
            }

        except Exception as e:
            logger.error(f"Error processing text content: {str(e)}")
            return {
                "success": False,
                "error": str(e),
                "task_id": task_id or str(uuid.uuid4())
            }

    async def reprocess_document(self, document_id: str, task_id: Optional[str] = None) -> Dict[str, Any]:
        """Reprocess an existing document (useful for updating embeddings).

        Deletes the document's existing vectors, then re-chunks and
        re-embeds its stored content.
        """
        try:
            document = await self.document_store.get_document(document_id)
            if not document:
                return {
                    "success": False,
                    "error": f"Document {document_id} not found",
                    "task_id": task_id or str(uuid.uuid4())
                }

            # Remove existing chunks from vector store
            await self.vector_store.delete_document(document_id)

            # Recreate and embed chunks
            chunks = await self._create_and_embed_chunks(document)

            if chunks:
                await self.vector_store.add_chunks(chunks)

                # Update document metadata with chunk count (best-effort).
                try:
                    current_metadata = document.metadata or {}
                    current_metadata["chunk_count"] = len(chunks)
                    await self.document_store.update_document_metadata(
                        document.id,
                        {"metadata": current_metadata}
                    )
                except Exception as e:
                    logger.warning(f"Failed to update chunk count for document {document.id}: {e}")

            return {
                "success": True,
                "task_id": task_id or str(uuid.uuid4()),
                "document_id": document_id,
                "filename": document.filename,
                "chunks_created": len(chunks),
                "message": f"Successfully reprocessed {document.filename}"
            }

        except Exception as e:
            logger.error(f"Error reprocessing document {document_id}: {str(e)}")
            return {
                "success": False,
                "error": str(e),
                "task_id": task_id or str(uuid.uuid4()),
                "document_id": document_id
            }

    async def batch_process_directory(self, directory_path: str, task_id: Optional[str] = None) -> Dict[str, Any]:
        """Process every supported file found directly in a directory.

        Files are processed sequentially; per-file failures are recorded in
        ``results`` and do not abort the batch.
        """
        try:
            directory = Path(directory_path)
            if not directory.exists() or not directory.is_dir():
                return {
                    "success": False,
                    "error": f"Directory {directory_path} does not exist",
                    "task_id": task_id or str(uuid.uuid4())
                }

            # Supported file extensions
            supported_extensions = {'.txt', '.pdf', '.docx', '.png', '.jpg', '.jpeg', '.bmp', '.tiff'}

            # Find all supported files (both lower- and upper-case extensions).
            files_to_process = []
            for ext in supported_extensions:
                files_to_process.extend(directory.glob(f"*{ext}"))
                files_to_process.extend(directory.glob(f"*{ext.upper()}"))

            if not files_to_process:
                return {
                    "success": False,
                    "error": "No supported files found in directory",
                    "task_id": task_id or str(uuid.uuid4())
                }

            # Process files
            results = []
            successful = 0
            failed = 0

            for file_path in files_to_process:
                try:
                    result = await self.process_document(str(file_path), file_path.suffix)
                    results.append(result)

                    if result.get("success"):
                        successful += 1
                    else:
                        failed += 1

                except Exception as e:
                    failed += 1
                    results.append({
                        "success": False,
                        "error": str(e),
                        "filename": file_path.name
                    })

            return {
                "success": True,
                "task_id": task_id or str(uuid.uuid4()),
                "directory": str(directory),
                "total_files": len(files_to_process),
                "successful": successful,
                "failed": failed,
                "results": results,
                "message": f"Processed {successful}/{len(files_to_process)} files successfully"
            }

        except Exception as e:
            logger.error(f"Error batch processing directory {directory_path}: {str(e)}")
            return {
                "success": False,
                "error": str(e),
                "task_id": task_id or str(uuid.uuid4())
            }
mcp_tools/podcast_tool.py ADDED
@@ -0,0 +1,211 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from typing import Dict, Any, List
3
+ from dataclasses import asdict
4
+
5
+ logger = logging.getLogger(__name__)
6
+
7
class PodcastTool:
    """
    MCP Tool for podcast generation from documents.

    Thin adapter around a PodcastGeneratorService instance: every method
    returns a plain dict with a "success" flag so it can be exposed
    directly as an MCP tool response.
    """

    def __init__(self, podcast_generator):
        """
        Initialize Podcast Tool

        Args:
            podcast_generator: PodcastGeneratorService instance
        """
        self.podcast_generator = podcast_generator

    async def generate_podcast(
        self,
        document_ids: List[str],
        style: str = "conversational",
        duration_minutes: int = 10,
        host1_voice: str = "Rachel",
        host2_voice: str = "Adam"
    ) -> Dict[str, Any]:
        """
        MCP Tool: Generate podcast from documents

        Args:
            document_ids: List of document IDs to generate podcast from
            style: Podcast style (conversational, educational, technical, casual)
            duration_minutes: Target duration in minutes
            host1_voice: Voice name for first host
            host2_voice: Voice name for second host

        Returns:
            Dictionary with podcast ID, audio URL, transcript, and metadata
        """
        try:
            if not document_ids:
                return {
                    "success": False,
                    "error": "No documents provided. Please select at least one document."
                }

            logger.info(f"Generating podcast from {len(document_ids)} documents")

            # Generate podcast using service
            result = await self.podcast_generator.generate_podcast(
                document_ids=document_ids,
                style=style,
                duration_minutes=duration_minutes,
                host1_voice=host1_voice,
                host2_voice=host2_voice
            )

            if result.success:
                # Guard the duration lookup: metadata may be None, and the
                # previous code dereferenced it unconditionally here while
                # guarding it everywhere else in this method.
                if result.metadata:
                    message = f"Podcast generated successfully! Duration: {result.metadata.duration_seconds/60:.1f} minutes"
                else:
                    message = "Podcast generated successfully!"

                return {
                    "success": True,
                    "podcast_id": result.podcast_id,
                    "audio_file": result.audio_file_path,
                    "audio_url": f"/data/podcasts/{result.podcast_id}.mp3",
                    "transcript": result.transcript,
                    "metadata": asdict(result.metadata) if result.metadata else {},
                    "generation_time": result.generation_time,
                    "message": message
                }
            else:
                return {
                    "success": False,
                    "error": result.error or "Unknown error during podcast generation"
                }

        except Exception as e:
            logger.error(f"Podcast generation failed: {str(e)}")
            return {
                "success": False,
                "error": str(e)
            }

    async def generate_transcript(
        self,
        document_ids: List[str],
        style: str = "conversational",
        duration_minutes: int = 10
    ) -> Dict[str, Any]:
        """
        MCP Tool: Generate podcast transcript ONLY (no audio)

        Args:
            document_ids: List of document IDs
            style: Podcast style
            duration_minutes: Target duration

        Returns:
            Dictionary with transcript and analysis
        """
        try:
            if not document_ids:
                return {"success": False, "error": "No documents provided"}

            logger.info(f"Generating transcript for {len(document_ids)} docs")

            # 1. Analyze the source documents.
            analysis = await self.podcast_generator.analyze_documents(document_ids)

            # 2. Generate the two-host script from the analysis.
            script = await self.podcast_generator.generate_script(
                analysis, style, duration_minutes
            )

            return {
                "success": True,
                "transcript": script.to_text(),
                "word_count": script.word_count,
                "estimated_duration": script.total_duration_estimate,
                "key_insights": analysis.key_insights,
                "topics": analysis.topics
            }

        except Exception as e:
            logger.error(f"Transcript generation failed: {str(e)}")
            return {"success": False, "error": str(e)}

    def get_podcast_audio(self, podcast_id: str) -> Dict[str, Any]:
        """
        MCP Tool: Get audio file path for a podcast

        Args:
            podcast_id: Podcast ID

        Returns:
            Dictionary with audio file path and whether the file exists
        """
        try:
            podcast = self.podcast_generator.get_podcast(podcast_id)
            if not podcast:
                return {"success": False, "error": "Podcast not found"}

            # Construct absolute path (assuming local running)
            # In a real remote setup, this might return a URL
            audio_path = f"/data/podcasts/{podcast_id}.mp3"

            # Local import keeps module-level dependencies unchanged.
            from pathlib import Path

            return {
                "success": True,
                "podcast_id": podcast_id,
                "audio_path": audio_path,
                # Previously hard-coded to True; report the real filesystem
                # state so clients can surface missing/expired audio files.
                "exists": Path(audio_path).exists()
            }
        except Exception as e:
            logger.error(f"Failed to get audio path: {str(e)}")
            return {"success": False, "error": str(e)}

    def list_podcasts(self, limit: int = 10) -> Dict[str, Any]:
        """
        List previously generated podcasts

        Args:
            limit: Maximum number of podcasts to return

        Returns:
            Dictionary with list of podcast metadata
        """
        try:
            podcasts = self.podcast_generator.list_podcasts(limit=limit)

            return {
                "success": True,
                "podcasts": [asdict(p) for p in podcasts],
                "total": len(podcasts)
            }
        except Exception as e:
            logger.error(f"Failed to list podcasts: {str(e)}")
            return {
                "success": False,
                "error": str(e),
                "podcasts": []
            }

    def get_podcast(self, podcast_id: str) -> Dict[str, Any]:
        """
        Get specific podcast by ID

        Args:
            podcast_id: Podcast identifier

        Returns:
            Dictionary with podcast metadata
        """
        try:
            podcast = self.podcast_generator.get_podcast(podcast_id)

            if podcast:
                return {
                    "success": True,
                    "podcast": asdict(podcast)
                }
            else:
                return {
                    "success": False,
                    "error": "Podcast not found"
                }
        except Exception as e:
            logger.error(f"Failed to get podcast: {str(e)}")
            return {
                "success": False,
                "error": str(e)
            }
mcp_tools/search_tool.py ADDED
@@ -0,0 +1,437 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from typing import List, Dict, Any, Optional
3
+ import asyncio
4
+
5
+ from core.models import SearchResult
6
+ from services.vector_store_service import VectorStoreService
7
+ from services.embedding_service import EmbeddingService
8
+ from services.document_store_service import DocumentStoreService
9
+ import config
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
class SearchTool:
    """MCP tool wrapping semantic (vector) search over the document library.

    Combines the embedding service (query -> vector), the vector store
    (similarity lookup) and, optionally, the document store (metadata
    enrichment, category/date filtering) plus a LlamaIndex service for
    agentic natural-language answers.
    """

    def __init__(self, vector_store: VectorStoreService, embedding_service: EmbeddingService,
                 document_store: Optional[DocumentStoreService] = None, llamaindex_service: Any = None):
        self.vector_store = vector_store
        self.embedding_service = embedding_service
        self.document_store = document_store  # optional: enables metadata enrichment / filtered search
        self.llamaindex_service = llamaindex_service  # optional: enables agentic_search
        self.config = config.config

    async def search(self, query: str, top_k: int = 5, filters: Optional[Dict[str, Any]] = None,
                     similarity_threshold: Optional[float] = None) -> List[SearchResult]:
        """Perform semantic search"""
        try:
            if not query.strip():
                logger.warning("Empty search query provided")
                return []

            # Use default threshold if not provided
            if similarity_threshold is None:
                similarity_threshold = self.config.SIMILARITY_THRESHOLD

            logger.info(f"Performing semantic search for: '{query}' (top_k={top_k})")

            # Generate query embedding
            query_embedding = await self.embedding_service.generate_single_embedding(query)

            # Embedding service signals failure with a falsy value.
            if not query_embedding:
                logger.error("Failed to generate query embedding")
                return []

            # Perform vector search
            results = await self.vector_store.search(
                query_embedding=query_embedding,
                top_k=top_k,
                filters=filters
            )

            # Filter by similarity threshold
            # NOTE(review): assumes higher score == more similar — confirm
            # against the vector store's distance metric.
            filtered_results = [
                result for result in results
                if result.score >= similarity_threshold
            ]

            logger.info(f"Found {len(filtered_results)} results above threshold {similarity_threshold}")

            # Enhance results with additional metadata if document store is available
            if self.document_store:
                enhanced_results = await self._enhance_results_with_metadata(filtered_results)
                return enhanced_results

            return filtered_results

        except Exception as e:
            # Best-effort: search errors degrade to an empty result set.
            logger.error(f"Error performing semantic search: {str(e)}")
            return []

    async def agentic_search(self, query: str) -> str:
        """Perform agentic search using LlamaIndex"""
        if not self.llamaindex_service:
            logger.warning("LlamaIndex service not available for agentic search")
            return "Agentic search not available."

        try:
            logger.info(f"Performing agentic search for: '{query}'")
            return await self.llamaindex_service.query(query)
        except Exception as e:
            # Errors are reported as a string answer, not raised.
            logger.error(f"Error performing agentic search: {str(e)}")
            return f"Error performing agentic search: {str(e)}"

    async def _enhance_results_with_metadata(self, results: List[SearchResult]) -> List[SearchResult]:
        """Enhance search results with document metadata"""
        try:
            enhanced_results = []

            for result in results:
                try:
                    # Get document metadata
                    document = await self.document_store.get_document(result.document_id)

                    if document:
                        # Add document metadata to result
                        enhanced_metadata = {
                            **result.metadata,
                            "document_filename": document.filename,
                            "document_type": document.doc_type.value,
                            "document_tags": document.tags,
                            "document_category": document.category,
                            "document_created_at": document.created_at.isoformat(),
                            "document_summary": document.summary
                        }

                        # Rebuild rather than mutate so the original result
                        # object stays untouched.
                        enhanced_result = SearchResult(
                            chunk_id=result.chunk_id,
                            document_id=result.document_id,
                            content=result.content,
                            score=result.score,
                            metadata=enhanced_metadata
                        )

                        enhanced_results.append(enhanced_result)
                    else:
                        # Document not found, use original result
                        enhanced_results.append(result)

                except Exception as e:
                    # Per-result failures keep the un-enhanced result.
                    logger.warning(f"Error enhancing result {result.chunk_id}: {str(e)}")
                    enhanced_results.append(result)

            return enhanced_results

        except Exception as e:
            logger.error(f"Error enhancing results: {str(e)}")
            return results

    async def multi_query_search(self, queries: List[str], top_k: int = 5,
                                 aggregate_method: str = "merge") -> List[SearchResult]:
        """Perform search with multiple queries and aggregate results"""
        try:
            all_results = []

            # Perform search for each query (blank queries are skipped).
            for query in queries:
                if query.strip():
                    query_results = await self.search(query, top_k)
                    all_results.extend(query_results)

            if not all_results:
                return []

            # Aggregate results: "merge" (dedupe, keep best score),
            # "intersect" (chunks hit by >1 query) or "average" scores.
            if aggregate_method == "merge":
                return await self._merge_results(all_results, top_k)
            elif aggregate_method == "intersect":
                return await self._intersect_results(all_results, top_k)
            elif aggregate_method == "average":
                return await self._average_results(all_results, top_k)
            else:
                # Default to merge
                return await self._merge_results(all_results, top_k)

        except Exception as e:
            logger.error(f"Error in multi-query search: {str(e)}")
            return []

    async def _merge_results(self, results: List[SearchResult], top_k: int) -> List[SearchResult]:
        """Merge results and remove duplicates, keeping highest scores"""
        try:
            # Group by chunk_id and keep highest score
            chunk_scores = {}
            chunk_results = {}

            for result in results:
                chunk_id = result.chunk_id
                if chunk_id not in chunk_scores or result.score > chunk_scores[chunk_id]:
                    chunk_scores[chunk_id] = result.score
                    chunk_results[chunk_id] = result

            # Sort by score and return top_k
            merged_results = list(chunk_results.values())
            merged_results.sort(key=lambda x: x.score, reverse=True)

            return merged_results[:top_k]

        except Exception as e:
            logger.error(f"Error merging results: {str(e)}")
            return results[:top_k]

    async def _intersect_results(self, results: List[SearchResult], top_k: int) -> List[SearchResult]:
        """Find chunks that appear in multiple queries"""
        try:
            # Count occurrences of each chunk
            chunk_counts = {}
            chunk_results = {}

            for result in results:
                chunk_id = result.chunk_id
                chunk_counts[chunk_id] = chunk_counts.get(chunk_id, 0) + 1

                # Keep the best-scoring instance of each chunk.
                if chunk_id not in chunk_results or result.score > chunk_results[chunk_id].score:
                    chunk_results[chunk_id] = result

            # Filter chunks that appear more than once
            intersect_results = [
                result for chunk_id, result in chunk_results.items()
                if chunk_counts[chunk_id] > 1
            ]

            # Sort by score
            intersect_results.sort(key=lambda x: x.score, reverse=True)

            return intersect_results[:top_k]

        except Exception as e:
            logger.error(f"Error intersecting results: {str(e)}")
            return []

    async def _average_results(self, results: List[SearchResult], top_k: int) -> List[SearchResult]:
        """Average scores for chunks that appear multiple times"""
        try:
            # Group by chunk_id and calculate average scores
            chunk_groups = {}

            for result in results:
                chunk_id = result.chunk_id
                if chunk_id not in chunk_groups:
                    chunk_groups[chunk_id] = []
                chunk_groups[chunk_id].append(result)

            # Calculate average scores
            averaged_results = []
            for chunk_id, group in chunk_groups.items():
                avg_score = sum(r.score for r in group) / len(group)

                # Use the result with the highest individual score but update the score to average
                best_result = max(group, key=lambda x: x.score)
                averaged_result = SearchResult(
                    chunk_id=best_result.chunk_id,
                    document_id=best_result.document_id,
                    content=best_result.content,
                    score=avg_score,
                    metadata={
                        **best_result.metadata,
                        "query_count": len(group),
                        "score_range": f"{min(r.score for r in group):.3f}-{max(r.score for r in group):.3f}"
                    }
                )
                averaged_results.append(averaged_result)

            # Sort by average score
            averaged_results.sort(key=lambda x: x.score, reverse=True)

            return averaged_results[:top_k]

        except Exception as e:
            logger.error(f"Error averaging results: {str(e)}")
            return results[:top_k]

    async def search_by_document(self, document_id: str, query: str, top_k: int = 5) -> List[SearchResult]:
        """Search within a specific document"""
        try:
            filters = {"document_id": document_id}
            return await self.search(query, top_k, filters)

        except Exception as e:
            logger.error(f"Error searching within document {document_id}: {str(e)}")
            return []

    async def search_by_category(self, category: str, query: str, top_k: int = 5) -> List[SearchResult]:
        """Search within documents of a specific category"""
        try:
            if not self.document_store:
                # Without a document store we cannot resolve the category;
                # fall back to an unfiltered search.
                logger.warning("Document store not available for category search")
                return await self.search(query, top_k)

            # Get documents in the category
            documents = await self.document_store.list_documents(
                limit=1000,  # Adjust as needed
                filters={"category": category}
            )

            if not documents:
                logger.info(f"No documents found in category '{category}'")
                return []

            # Extract document IDs
            document_ids = [doc.id for doc in documents]

            # Search with document ID filter
            filters = {"document_ids": document_ids}
            return await self.search(query, top_k, filters)

        except Exception as e:
            logger.error(f"Error searching by category {category}: {str(e)}")
            return []

    async def search_with_date_range(self, query: str, start_date, end_date, top_k: int = 5) -> List[SearchResult]:
        """Search documents within a date range.

        NOTE(review): start_date/end_date are passed straight through to the
        document store's "created_after"/"created_before" filters — confirm
        the expected type (datetime vs ISO string) against that service.
        """
        try:
            if not self.document_store:
                logger.warning("Document store not available for date range search")
                return await self.search(query, top_k)

            # Get documents in the date range
            documents = await self.document_store.list_documents(
                limit=1000,  # Adjust as needed
                filters={
                    "created_after": start_date,
                    "created_before": end_date
                }
            )

            if not documents:
                logger.info(f"No documents found in date range")
                return []

            # Extract document IDs
            document_ids = [doc.id for doc in documents]

            # Search with document ID filter
            filters = {"document_ids": document_ids}
            return await self.search(query, top_k, filters)

        except Exception as e:
            logger.error(f"Error searching with date range: {str(e)}")
            return []

    async def get_search_suggestions(self, partial_query: str, limit: int = 5) -> List[str]:
        """Get search suggestions based on partial query"""
        try:
            # This is a simple implementation
            # In a production system, you might want to use a more sophisticated approach

            if len(partial_query) < 2:
                return []

            # Search for the partial query
            results = await self.search(partial_query, top_k=20)

            # Extract potential query expansions from content
            suggestions = set()

            for result in results:
                content_words = result.content.lower().split()
                for i, word in enumerate(content_words):
                    if partial_query.lower() in word:
                        # Add the word itself
                        suggestions.add(word.strip('.,!?;:'))

                        # Add phrases that include this word
                        if i > 0:
                            phrase = f"{content_words[i-1]} {word}".strip('.,!?;:')
                            suggestions.add(phrase)
                        if i < len(content_words) - 1:
                            phrase = f"{word} {content_words[i+1]}".strip('.,!?;:')
                            suggestions.add(phrase)

            # Filter and sort suggestions: keep only strict extensions of the
            # (lower-cased) partial query.
            filtered_suggestions = [
                s for s in suggestions
                if len(s) > len(partial_query) and s.startswith(partial_query.lower())
            ]

            return sorted(filtered_suggestions)[:limit]

        except Exception as e:
            logger.error(f"Error getting search suggestions: {str(e)}")
            return []

    async def explain_search(self, query: str, top_k: int = 3) -> Dict[str, Any]:
        """Provide detailed explanation of search process and results"""
        try:
            explanation = {
                "query": query,
                "steps": [],
                "results_analysis": {},
                "performance_metrics": {}
            }

            # Step 1: Query processing
            explanation["steps"].append({
                "step": "query_processing",
                "description": "Processing and normalizing the search query",
                "details": {
                    "original_query": query,
                    "cleaned_query": query.strip(),
                    "query_length": len(query)
                }
            })

            # Step 2: Embedding generation (timed)
            import time
            start_time = time.time()

            query_embedding = await self.embedding_service.generate_single_embedding(query)

            embedding_time = time.time() - start_time

            explanation["steps"].append({
                "step": "embedding_generation",
                "description": "Converting query to vector embedding",
                "details": {
                    "embedding_dimension": len(query_embedding) if query_embedding else 0,
                    "generation_time_ms": round(embedding_time * 1000, 2)
                }
            })

            # Step 3: Vector search (timed). Unlike search(), no threshold
            # filtering or metadata enrichment is applied here.
            start_time = time.time()

            results = await self.vector_store.search(query_embedding, top_k)

            search_time = time.time() - start_time

            explanation["steps"].append({
                "step": "vector_search",
                "description": "Searching vector database for similar content",
                "details": {
                    "search_time_ms": round(search_time * 1000, 2),
                    "results_found": len(results),
                    "top_score": results[0].score if results else 0,
                    "score_range": f"{min(r.score for r in results):.3f}-{max(r.score for r in results):.3f}" if results else "N/A"
                }
            })

            # Results analysis
            if results:
                explanation["results_analysis"] = {
                    "total_results": len(results),
                    "average_score": sum(r.score for r in results) / len(results),
                    "unique_documents": len(set(r.document_id for r in results)),
                    "content_lengths": [len(r.content) for r in results]
                }

            # Performance metrics
            explanation["performance_metrics"] = {
                "total_time_ms": round((embedding_time + search_time) * 1000, 2),
                "embedding_time_ms": round(embedding_time * 1000, 2),
                "search_time_ms": round(search_time * 1000, 2)
            }

            return explanation

        except Exception as e:
            logger.error(f"Error explaining search: {str(e)}")
            return {"error": str(e)}
mcp_tools/utils.py ADDED
@@ -0,0 +1,373 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import asyncio
3
+ import functools
4
+ from typing import Any, Callable, Dict, List, Optional
5
+ import time
6
+ import json
7
+ from pathlib import Path
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
def async_timer(func: Callable) -> Callable:
    """Decorator that logs how long an async function took to run.

    Successful calls are logged at DEBUG level; failures are logged at
    ERROR level (with elapsed time) and the exception is re-raised.
    """
    @functools.wraps(func)
    async def timed(*args, **kwargs):
        started = time.time()
        try:
            result = await func(*args, **kwargs)
        except Exception as exc:
            elapsed = time.time() - started
            logger.error(f"{func.__name__} failed after {elapsed:.3f}s: {str(exc)}")
            raise
        elapsed = time.time() - started
        logger.debug(f"{func.__name__} completed in {elapsed:.3f}s")
        return result
    return timed
26
+
27
def retry_async(max_attempts: int = 3, delay: float = 1.0, backoff: float = 2.0):
    """Decorator factory: retry an async function with exponential backoff.

    Args:
        max_attempts: Total number of attempts. Treated as at least 1, so a
            non-positive value can no longer make the wrapper silently
            return None without ever calling the wrapped function (the old
            while-loop fell through in that case).
        delay: Initial wait between attempts, in seconds.
        backoff: Multiplier applied to the delay after each failure.
    """
    def decorator(func: Callable) -> Callable:
        @functools.wraps(func)
        async def wrapper(*args, **kwargs):
            attempts = max(1, max_attempts)
            current_delay = delay

            for attempt in range(1, attempts + 1):
                try:
                    return await func(*args, **kwargs)
                except Exception as e:
                    if attempt == attempts:
                        # Out of retries: surface the last failure.
                        logger.error(f"{func.__name__} failed after {attempts} attempts: {str(e)}")
                        raise

                    logger.warning(f"{func.__name__} attempt {attempt} failed: {str(e)}")
                    logger.info(f"Retrying in {current_delay}s...")

                    await asyncio.sleep(current_delay)
                    current_delay *= backoff

        return wrapper
    return decorator
52
+
53
class MCPToolResponse:
    """Standardized response envelope for MCP tools.

    Carries a success flag, a payload (on success) or an error message
    (on failure), optional metadata, and a creation timestamp.
    """

    def __init__(self, success: bool, data: Any = None, error: str = None,
                 metadata: Dict[str, Any] = None):
        self.success = success
        self.data = data
        self.error = error
        self.metadata = metadata or {}
        self.timestamp = time.time()

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the response to a plain dictionary."""
        payload = {
            "success": self.success,
            "timestamp": self.timestamp
        }
        # Only one of data/error is emitted, depending on outcome.
        if self.success:
            payload["data"] = self.data
        else:
            payload["error"] = self.error
        if self.metadata:
            payload["metadata"] = self.metadata
        return payload

    @classmethod
    def success_response(cls, data: Any, metadata: Dict[str, Any] = None):
        """Build a successful response carrying *data*."""
        return cls(success=True, data=data, metadata=metadata)

    @classmethod
    def error_response(cls, error: str, metadata: Dict[str, Any] = None):
        """Build a failed response carrying an error message."""
        return cls(success=False, error=error, metadata=metadata)
90
+
91
def validate_required_params(params: Dict[str, Any], required: List[str]) -> Optional[str]:
    """Validate that all *required* keys are present and not None.

    Returns:
        An error message naming the missing parameters, or None when the
        input is valid.
    """
    # A missing key and an explicit None value are treated the same.
    missing = [name for name in required if params.get(name) is None]
    if missing:
        return f"Missing required parameters: {', '.join(missing)}"
    return None
102
+
103
def sanitize_filename(filename: str) -> str:
    """Sanitize a filename for safe storage.

    Replaces characters that are invalid on common filesystems with '_',
    strips leading/trailing dots and spaces, caps the length at 255
    characters (preserving the extension), and falls back to
    "unnamed_file" when nothing is left.
    """
    import re

    # Replace reserved characters, then trim dot/space padding.
    cleaned = re.sub(r'[<>:"/\\|?*]', '_', filename).strip('. ')

    # Enforce the common 255-character filename limit, keeping the suffix.
    if len(cleaned) > 255:
        stem, ext = Path(cleaned).stem, Path(cleaned).suffix
        cleaned = stem[:255 - len(ext)] + ext

    return cleaned or "unnamed_file"
124
+
125
def truncate_text(text: str, max_length: int, add_ellipsis: bool = True) -> str:
    """Truncate *text* to at most *max_length* characters.

    When *add_ellipsis* is true and the budget allows (max_length > 3),
    the result ends in "..." which counts toward the length limit.
    """
    if len(text) <= max_length:
        return text

    use_ellipsis = add_ellipsis and max_length > 3
    return text[:max_length - 3] + "..." if use_ellipsis else text[:max_length]
134
+
135
def extract_file_info(file_path: str) -> Dict[str, Any]:
    """Collect basic filesystem metadata for *file_path*.

    Returns a dict with name, extension, size and timestamp fields, or
    {"error": ...} when the path cannot be stat'ed (e.g. it does not
    exist).
    """
    try:
        path = Path(file_path)
        meta = path.stat()
        size = meta.st_size
        return {
            "filename": path.name,
            "extension": path.suffix.lower(),
            "size_bytes": size,
            "size_mb": round(size / (1024 * 1024), 2),
            "created_time": meta.st_ctime,
            "modified_time": meta.st_mtime,
            "exists": path.exists(),
            "is_file": path.is_file(),
            "is_dir": path.is_dir()
        }
    except Exception as e:
        return {"error": str(e)}
154
+
155
async def batch_process(items: List[Any], processor: Callable, batch_size: int = 10,
                        max_concurrent: int = 5) -> List[Any]:
    """Run async *processor* over *items* in batches with bounded concurrency.

    Exceptions raised by individual items are returned in place in the
    result list (``gather(..., return_exceptions=True)``) rather than
    raised, so one failure never aborts the batch.
    """
    gate = asyncio.Semaphore(max_concurrent)

    async def guarded(item):
        # The semaphore caps in-flight work within a batch.
        async with gate:
            return await processor(item)

    collected: List[Any] = []
    for start in range(0, len(items), batch_size):
        chunk = items[start:start + batch_size]
        outcomes = await asyncio.gather(
            *(guarded(entry) for entry in chunk),
            return_exceptions=True
        )
        collected.extend(outcomes)

    return collected
173
+
174
def format_file_size(size_bytes: int) -> str:
    """Format a byte count as a human-readable string, e.g. "1.5 MB"."""
    amount = float(size_bytes)
    for unit in ('B', 'KB', 'MB', 'GB', 'TB'):
        if amount < 1024.0:
            return f"{amount:.1f} {unit}"
        amount /= 1024.0
    # Anything past TB is reported in petabytes.
    return f"{amount:.1f} PB"
181
+
182
def calculate_reading_time(text: str, words_per_minute: int = 200) -> int:
    """Estimate reading time in whole minutes (never less than 1)."""
    minutes = round(len(text.split()) / words_per_minute)
    return max(1, minutes)
186
+
187
class ProgressTracker:
    """Track progress of long-running, multi-item operations."""

    def __init__(self, total_items: int, description: str = "Processing"):
        self.total_items = total_items
        self.completed_items = 0
        self.description = description
        self.start_time = time.time()
        self.errors = []

    def update(self, completed: int = 1, error: str = None):
        """Record *completed* additional items and an optional error message."""
        self.completed_items += completed
        if error:
            self.errors.append(error)

    def get_progress(self) -> Dict[str, Any]:
        """Return a snapshot of progress, timing, and recent errors."""
        elapsed = time.time() - self.start_time

        if self.total_items > 0:
            percent = (self.completed_items / self.total_items) * 100
        else:
            percent = 0

        # Linear extrapolation from the average time per completed item.
        if self.completed_items > 0:
            per_item = elapsed / self.completed_items
            remaining = per_item * (self.total_items - self.completed_items)
        else:
            remaining = 0

        return {
            "description": self.description,
            "total_items": self.total_items,
            "completed_items": self.completed_items,
            "progress_percent": round(percent, 1),
            "elapsed_time_seconds": round(elapsed, 1),
            "estimated_remaining_seconds": round(remaining, 1),
            "errors_count": len(self.errors),
            "errors": self.errors[-5:] if self.errors else []  # only the last 5
        }

    def is_complete(self) -> bool:
        """True once at least *total_items* completions have been recorded."""
        return self.completed_items >= self.total_items
230
+
231
def load_json_config(config_path: str, default_config: Dict[str, Any] = None) -> Dict[str, Any]:
    """Load a JSON configuration file, falling back to *default_config*.

    A missing file or malformed JSON is logged and the defaults (or an
    empty dict) are returned instead of raising.
    """
    fallback = default_config or {}
    try:
        with open(config_path, 'r') as fh:
            loaded = json.load(fh)
    except FileNotFoundError:
        logger.warning(f"Configuration file {config_path} not found, using defaults")
        return fallback
    except json.JSONDecodeError as e:
        logger.error(f"Invalid JSON in configuration file {config_path}: {str(e)}")
        return fallback
    logger.info(f"Loaded configuration from {config_path}")
    return loaded
244
+
245
def save_json_config(config: Dict[str, Any], config_path: str) -> bool:
    """Write *config* as pretty-printed JSON, creating parent directories.

    Returns:
        True on success; failures are logged and reported as False.
    """
    try:
        target = Path(config_path)
        # Ensure the destination directory hierarchy exists first.
        target.parent.mkdir(parents=True, exist_ok=True)

        with open(config_path, 'w') as fh:
            json.dump(config, fh, indent=2)
    except Exception as e:
        logger.error(f"Failed to save configuration to {config_path}: {str(e)}")
        return False
    logger.info(f"Saved configuration to {config_path}")
    return True
259
+
260
class RateLimiter:
    """Simple sliding-window rate limiter for API calls.

    Allows at most *max_calls* calls per *time_window* seconds. Intended
    for use from a single event loop (no cross-thread locking).
    """

    def __init__(self, max_calls: int, time_window: float):
        self.max_calls = max_calls
        self.time_window = time_window
        self.calls = []  # timestamps of calls still inside the window

    async def acquire(self):
        """Wait until a call is permitted, then record it.

        Rewritten from a recursive retry to an iterative loop so that
        sustained contention cannot grow the call stack without bound;
        the loop also re-checks the window after sleeping, since another
        task may have taken the freed slot in the meantime.
        """
        while True:
            now = time.time()

            # Drop timestamps that have aged out of the window.
            self.calls = [call_time for call_time in self.calls if now - call_time < self.time_window]

            if len(self.calls) < self.max_calls:
                # Slot available: record this call and proceed.
                self.calls.append(now)
                return

            # Window is full: sleep until the oldest call expires.
            oldest_call = min(self.calls)
            wait_time = self.time_window - (now - oldest_call)
            if wait_time > 0:
                await asyncio.sleep(wait_time)
286
+
287
def escape_markdown(text: str) -> str:
    """Backslash-escape markdown special characters in *text*."""
    import re

    # One pass over the characters markdown treats as formatting.
    return re.sub(r'([*_`\[\]()#+\-!\\])', r'\\\1', text)
294
+
295
def create_error_summary(errors: List[Exception]) -> str:
    """Summarize a list of exceptions by type.

    Example: "Encountered 3 total errors: 2 ValueErrors, 1 KeyError".
    """
    if not errors:
        return "No errors"

    # Tally exceptions by class name, preserving first-seen order.
    counts: Dict[str, int] = {}
    for err in errors:
        name = type(err).__name__
        counts[name] = counts.get(name, 0) + 1

    parts = [
        f"1 {name}" if n == 1 else f"{n} {name}s"
        for name, n in counts.items()
    ]
    return f"Encountered {len(errors)} total errors: " + ", ".join(parts)
313
+
314
async def safe_execute(func: Callable, *args, default_return=None, **kwargs):
    """Execute *func* with the given arguments, swallowing exceptions.

    Coroutine functions are awaited transparently; any raised exception
    is logged and *default_return* is handed back instead.
    """
    try:
        if not asyncio.iscoroutinefunction(func):
            return func(*args, **kwargs)
        return await func(*args, **kwargs)
    except Exception as e:
        logger.error(f"Error executing {func.__name__}: {str(e)}")
        return default_return
324
+
325
def get_content_preview(content: str, max_length: int = 200) -> str:
    """Produce a short display preview of *content*.

    Whitespace is collapsed to single spaces. Content longer than
    *max_length* is cut at a sentence boundary when one falls in the
    last 30% of the window; otherwise at a word boundary (with "...");
    otherwise hard-truncated (with "...").
    """
    if not content:
        return "No content"

    normalized = ' '.join(content.split())
    if len(normalized) <= max_length:
        return normalized

    window = normalized[:max_length]

    # Prefer a sentence boundary near the end of the window.
    sentence_cut = max(window.rfind('.'), window.rfind('!'), window.rfind('?'))
    if sentence_cut > max_length * 0.7:
        return window[:sentence_cut + 1]

    # Fall back to a word boundary, then to a hard cut.
    word_cut = window.rfind(' ')
    if word_cut > max_length * 0.7:
        return window[:word_cut] + "..."
    return window + "..."
349
+
350
class MemoryUsageTracker:
    """Track process memory (RSS) growth across an operation."""

    def __init__(self):
        # Baseline RSS in MB, captured at construction time.
        self.start_memory = self._get_memory_usage()

    def _get_memory_usage(self) -> float:
        """Current process RSS in MB; 0.0 when psutil is unavailable."""
        try:
            import psutil
        except ImportError:
            return 0.0
        return psutil.Process().memory_info().rss / 1024 / 1024

    def get_usage_delta(self) -> float:
        """Memory change (MB) since this tracker was created."""
        return self._get_memory_usage() - self.start_memory

    def log_usage(self, operation_name: str):
        """Log the current memory delta for *operation_name* at INFO level."""
        delta = self.get_usage_delta()
        logger.info(f"{operation_name} memory delta: {delta:.1f} MB")
mcp_tools/voice_tool.py ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from typing import Dict, Any, Optional
3
+ import asyncio
4
+
5
+ logger = logging.getLogger(__name__)
6
+
7
class VoiceTool:
    """
    MCP tool exposing the ElevenLabs conversational assistant for Q&A.

    A thin async facade over the ElevenLabs service: it validates inputs,
    delegates the work, and normalizes every outcome into a
    ``{"success": bool, ...}`` dictionary so callers never see exceptions.
    """

    def __init__(self, elevenlabs_service):
        """
        Args:
            elevenlabs_service: ElevenLabs service instance (may be None or
                unconfigured; voice_qa/test_connection report that cleanly).
        """
        self.elevenlabs_service = elevenlabs_service

    async def voice_qa(
        self,
        question: str,
        session_id: Optional[str] = None
    ) -> Dict[str, Any]:
        """
        Answer a question through the voice assistant (text mode for web UIs).

        Args:
            question: User's question.
            session_id: Optional session ID for conversation context.

        Returns:
            Dictionary with the answer and metadata, or an error payload.
        """
        try:
            service = self.elevenlabs_service
            # The service must exist and report itself ready before we proceed.
            if not service or not service.is_available():
                return {
                    "success": False,
                    "error": "Voice assistant not configured. Please set ELEVENLABS_API_KEY in your .env file.",
                    "help": "Get your API key from: https://elevenlabs.io/app/settings/api-keys"
                }

            if not question or not question.strip():
                return {"success": False, "error": "Please enter a question"}

            logger.info(f"Voice QA (session: {session_id}): {question}")

            # Delegate to the ElevenLabs service; "default" keeps contextless
            # callers working when no session was established.
            reply = await service.send_text_message(
                message=question,
                session_id=session_id or "default"
            )

            if not reply.get("success"):
                return {
                    "success": False,
                    "error": reply.get("error", "Unknown error"),
                    "question": question
                }

            return {
                "success": True,
                "question": question,
                "answer": reply["answer"],
                "session_id": session_id,
                "mode": "text"
            }

        except Exception as e:
            logger.error(f"Voice QA failed: {str(e)}", exc_info=True)
            return {
                "success": False,
                "error": f"An error occurred: {str(e)}",
                "question": question
            }

    async def start_session(self, session_id: str) -> Dict[str, Any]:
        """
        Start a new voice assistant session.

        Args:
            session_id: Unique session identifier.

        Returns:
            Session start status dictionary from the service.
        """
        try:
            return await self.elevenlabs_service.start_conversation(session_id)
        except Exception as e:
            logger.error(f"Failed to start session: {str(e)}")
            return {"success": False, "error": str(e)}

    async def end_session(self, session_id: str) -> Dict[str, Any]:
        """
        End a voice assistant session.

        Args:
            session_id: Session identifier.

        Returns:
            Session end status, reporting whether the session existed.
        """
        try:
            ended = await self.elevenlabs_service.end_conversation(session_id)
            return {
                "success": ended,
                "message": "Session ended" if ended else "Session not found"
            }
        except Exception as e:
            logger.error(f"Failed to end session: {str(e)}")
            return {"success": False, "error": str(e)}

    def get_conversation_history(self, session_id: str) -> Dict[str, Any]:
        """
        Get conversation history for a session.

        Args:
            session_id: Session identifier.

        Returns:
            Dictionary with the message history and its length.
        """
        try:
            messages = self.elevenlabs_service.get_conversation_history(session_id)
            return {
                "success": True,
                "history": messages,
                "message_count": len(messages)
            }
        except Exception as e:
            logger.error(f"Failed to get history: {str(e)}")
            return {"success": False, "error": str(e), "history": []}

    async def test_connection(self) -> Dict[str, Any]:
        """
        Test the voice assistant connection.

        Returns:
            Connection test results merged with a boolean "success" flag.
        """
        try:
            if not self.elevenlabs_service:
                return {"success": False, "message": "Service not initialized"}

            outcome = await self.elevenlabs_service.test_connection()
            return {"success": outcome["status"] == "success", **outcome}
        except Exception as e:
            logger.error(f"Connection test failed: {str(e)}")
            return {"success": False, "message": str(e)}
packages.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ tesseract-ocr
2
+ portaudio19-dev
requirements.txt ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ gradio[mcp]==5.38.0
2
+ anthropic>=0.7.0
3
+ mistralai
4
+ sentence-transformers>=2.2.2
5
+ transformers>=4.30.0
6
+ torch>=2.0.0
7
+ faiss-cpu>=1.7.4
8
+ numpy>=1.24.0
9
+ pandas>=2.0.0
10
+ PyPDF2>=3.0.0
11
+ python-docx>=0.8.11
12
+ Pillow>=10.0.0
13
+ pytesseract>=0.3.10
14
+ aiofiles>=23.0.0
15
+ pydantic>=2.0.0
16
+ httpx>=0.24.0
17
+ uvicorn[standard]
18
+ python-multipart>=0.0.6
19
+ asyncio-mqtt>=0.11.1
20
+ nest-asyncio>=1.5.6
21
+ fastapi
22
+ fastmcp
23
+ mcp
24
+ openai
25
+ python-dotenv
26
+ llama-index>=0.10.0
27
+ llama-index-llms-openai
28
+ llama-index-llms-anthropic
29
+ llama-index-embeddings-huggingface
30
+ elevenlabs>=1.0.0
31
+ spaces
32
+ websockets>=12.0
33
+ pyaudio>=0.2.14
34
+ requests
services/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # Services module initialization
services/document_store_service.py ADDED
@@ -0,0 +1,349 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import json
3
+ import os
4
+ from typing import List, Dict, Any, Optional
5
+ from pathlib import Path
6
+ import pickle
7
+ from datetime import datetime
8
+ import asyncio
9
+
10
+ from core.models import Document, DocumentType
11
+ import config
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
class DocumentStoreService:
    """
    File-backed persistence layer for Document objects.

    Each document is stored as two files under DOCUMENT_STORE_PATH: a JSON
    metadata record (metadata/<id>.json) and the raw text body
    (content/<id>.txt).  A small FIFO in-memory cache keeps recently used
    documents hot.

    Fixes over the original version:
    - list_documents now applies filters BEFORE pagination (previously the
      page window was sliced first, so filtered pages came back short or
      wrong whenever a filter excluded documents inside the window).
    - Document reconstruction is factored into one helper instead of being
      duplicated in get_document and list_documents.
    - get_stats no longer uses a bare ``except:``.
    """

    def __init__(self):
        self.config = config.config
        self.store_path = Path(self.config.DOCUMENT_STORE_PATH)
        self.store_path.mkdir(parents=True, exist_ok=True)

        # Metadata and content live in separate directories so listings can
        # read the small JSON records without touching large bodies.
        self.metadata_path = self.store_path / "metadata"
        self.content_path = self.store_path / "content"

        self.metadata_path.mkdir(exist_ok=True)
        self.content_path.mkdir(exist_ok=True)

        # In-memory FIFO cache for frequently accessed documents.
        self._cache = {}
        self._cache_size_limit = 100

    def _document_from_metadata(self, metadata: Dict[str, Any], content: str) -> Document:
        """Rebuild a Document from a persisted metadata record and its content."""
        return Document(
            id=metadata["id"],
            filename=metadata["filename"],
            content=content,
            doc_type=DocumentType(metadata["doc_type"]),
            file_size=metadata["file_size"],
            created_at=datetime.fromisoformat(metadata["created_at"]),
            metadata=metadata.get("metadata", {}),
            tags=metadata.get("tags", []),
            summary=metadata.get("summary"),
            category=metadata.get("category"),
            language=metadata.get("language")
        )

    def _read_content(self, document_id: str) -> str:
        """Load a document's raw text body, or '' if the content file is missing."""
        content_file = self.content_path / f"{document_id}.txt"
        if not content_file.exists():
            return ""
        with open(content_file, 'r', encoding='utf-8') as f:
            return f.read()

    async def store_document(self, document: Document) -> bool:
        """Persist a document (metadata + content).  Returns True on success."""
        try:
            # Store metadata as a small JSON record.
            metadata_file = self.metadata_path / f"{document.id}.json"
            metadata = {
                "id": document.id,
                "filename": document.filename,
                "doc_type": document.doc_type.value,
                "file_size": document.file_size,
                "created_at": document.created_at.isoformat(),
                "metadata": document.metadata,
                "tags": document.tags,
                "summary": document.summary,
                "category": document.category,
                "language": document.language,
                "content_length": len(document.content)
            }

            with open(metadata_file, 'w', encoding='utf-8') as f:
                json.dump(metadata, f, indent=2, ensure_ascii=False)

            # Content is stored separately because it can be large.
            content_file = self.content_path / f"{document.id}.txt"
            with open(content_file, 'w', encoding='utf-8') as f:
                f.write(document.content)

            # Cache the freshly stored document.
            self._add_to_cache(document.id, document)

            logger.info(f"Stored document {document.id} ({document.filename})")
            return True

        except Exception as e:
            logger.error(f"Error storing document {document.id}: {str(e)}")
            return False

    async def get_document(self, document_id: str) -> Optional[Document]:
        """Retrieve a document by ID, using the in-memory cache when possible."""
        try:
            # Check cache first.
            if document_id in self._cache:
                return self._cache[document_id]

            metadata_file = self.metadata_path / f"{document_id}.json"
            content_file = self.content_path / f"{document_id}.txt"

            if not metadata_file.exists() or not content_file.exists():
                return None

            with open(metadata_file, 'r', encoding='utf-8') as f:
                metadata = json.load(f)

            document = self._document_from_metadata(metadata, self._read_content(document_id))

            self._add_to_cache(document_id, document)
            return document

        except Exception as e:
            logger.error(f"Error retrieving document {document_id}: {str(e)}")
            return None

    async def list_documents(self, limit: int = 50, offset: int = 0,
                             filters: Optional[Dict[str, Any]] = None) -> List[Document]:
        """
        List documents, newest first, with filtering and pagination.

        Filters are applied before the offset/limit window is taken, so a
        page always contains up to ``limit`` documents that actually match.
        """
        try:
            documents: List[Document] = []
            metadata_files = list(self.metadata_path.glob("*.json"))

            # Sort by file modification time (newest first).
            metadata_files.sort(key=lambda x: x.stat().st_mtime, reverse=True)

            matched = 0  # documents that have passed the filters so far
            for metadata_file in metadata_files:
                if len(documents) >= limit:
                    break
                try:
                    with open(metadata_file, 'r', encoding='utf-8') as f:
                        metadata = json.load(f)

                    # Filter first, then paginate over the filtered stream.
                    if filters and not self._apply_filters(metadata, filters):
                        continue

                    matched += 1
                    if matched <= offset:
                        continue  # still skipping past the requested offset

                    content = self._read_content(metadata['id'])
                    documents.append(self._document_from_metadata(metadata, content))

                except Exception as e:
                    logger.warning(f"Error loading document metadata from {metadata_file}: {str(e)}")
                    continue

            return documents

        except Exception as e:
            logger.error(f"Error listing documents: {str(e)}")
            return []

    def _apply_filters(self, metadata: Dict[str, Any], filters: Dict[str, Any]) -> bool:
        """Return True if *metadata* satisfies every entry in *filters*.

        On any filter-evaluation error the document is kept (fail-open),
        matching the original best-effort behavior.
        """
        try:
            for key, value in filters.items():
                if key == "doc_type":
                    if metadata.get("doc_type") != value:
                        return False
                elif key == "filename_contains":
                    if value.lower() not in metadata.get("filename", "").lower():
                        return False
                elif key == "created_after":
                    doc_date = datetime.fromisoformat(metadata.get("created_at", ""))
                    if doc_date < value:
                        return False
                elif key == "created_before":
                    doc_date = datetime.fromisoformat(metadata.get("created_at", ""))
                    if doc_date > value:
                        return False
                elif key == "tags":
                    # Match if ANY requested tag is present on the document.
                    doc_tags = set(metadata.get("tags", []))
                    required_tags = set(value) if isinstance(value, list) else {value}
                    if not required_tags.intersection(doc_tags):
                        return False
                elif key == "category":
                    if metadata.get("category") != value:
                        return False
                elif key == "language":
                    if metadata.get("language") != value:
                        return False

            return True
        except Exception as e:
            logger.error(f"Error applying filters: {str(e)}")
            return True

    async def update_document_metadata(self, document_id: str, updates: Dict[str, Any]) -> bool:
        """Update selected metadata fields of a stored document."""
        try:
            metadata_file = self.metadata_path / f"{document_id}.json"

            if not metadata_file.exists():
                logger.warning(f"Document {document_id} not found")
                return False

            with open(metadata_file, 'r', encoding='utf-8') as f:
                metadata = json.load(f)

            # Only a whitelisted set of fields may be rewritten.
            for key, value in updates.items():
                if key in ["tags", "summary", "category", "language", "metadata"]:
                    metadata[key] = value

            with open(metadata_file, 'w', encoding='utf-8') as f:
                json.dump(metadata, f, indent=2, ensure_ascii=False)

            # Keep any cached copy in sync with the persisted record.
            if document_id in self._cache:
                document = self._cache[document_id]
                for key, value in updates.items():
                    if hasattr(document, key):
                        setattr(document, key, value)

            logger.info(f"Updated metadata for document {document_id}")
            return True

        except Exception as e:
            logger.error(f"Error updating document metadata: {str(e)}")
            return False

    async def delete_document(self, document_id: str) -> bool:
        """Delete a document's metadata, content, and cache entry."""
        try:
            metadata_file = self.metadata_path / f"{document_id}.json"
            content_file = self.content_path / f"{document_id}.txt"

            if metadata_file.exists():
                metadata_file.unlink()
            if content_file.exists():
                content_file.unlink()

            if document_id in self._cache:
                del self._cache[document_id]

            logger.info(f"Deleted document {document_id}")
            return True

        except Exception as e:
            logger.error(f"Error deleting document {document_id}: {str(e)}")
            return False

    async def search_documents(self, query: str, fields: Optional[List[str]] = None) -> List[Document]:
        """Case-insensitive substring search over the given document fields."""
        if not fields:
            fields = ["filename", "content", "tags", "summary"]

        try:
            matching_documents = []
            query_lower = query.lower()

            # NOTE: loads up to 1000 documents into memory; adjust as needed.
            all_documents = await self.list_documents(limit=1000)

            for document in all_documents:
                for field in fields:
                    field_value = getattr(document, field, "")
                    if isinstance(field_value, list):
                        field_value = " ".join(field_value)
                    elif field_value is None:
                        field_value = ""

                    if query_lower in str(field_value).lower():
                        matching_documents.append(document)
                        break

            logger.info(f"Found {len(matching_documents)} documents matching '{query}'")
            return matching_documents

        except Exception as e:
            logger.error(f"Error searching documents: {str(e)}")
            return []

    def _add_to_cache(self, document_id: str, document: Document):
        """Insert a document into the FIFO cache, evicting the oldest entry if full."""
        try:
            if len(self._cache) >= self._cache_size_limit:
                # dicts preserve insertion order, so the first key is the oldest.
                oldest_key = next(iter(self._cache))
                del self._cache[oldest_key]

            self._cache[document_id] = document
        except Exception as e:
            logger.error(f"Error adding to cache: {str(e)}")

    async def get_stats(self) -> Dict[str, Any]:
        """Return storage statistics (counts, sizes, type breakdown)."""
        try:
            metadata_files = list(self.metadata_path.glob("*.json"))
            content_files = list(self.content_path.glob("*.txt"))

            # Total on-disk footprint of metadata + content files.
            total_size = 0
            for file_path in metadata_files + content_files:
                total_size += file_path.stat().st_size

            # Count documents per declared type; skip unreadable records.
            type_counts: Dict[str, int] = {}
            for metadata_file in metadata_files:
                try:
                    with open(metadata_file, 'r') as f:
                        metadata = json.load(f)
                    doc_type = metadata.get("doc_type", "unknown")
                    type_counts[doc_type] = type_counts.get(doc_type, 0) + 1
                except Exception:
                    continue

            return {
                "total_documents": len(metadata_files),
                "total_size_bytes": total_size,
                "total_size_mb": round(total_size / (1024 * 1024), 2),
                "cache_size": len(self._cache),
                "document_types": type_counts,
                "storage_path": str(self.store_path),
                "metadata_files": len(metadata_files),
                "content_files": len(content_files)
            }
        except Exception as e:
            logger.error(f"Error getting document store stats: {str(e)}")
            return {"error": str(e)}
services/elevenlabs_service.py ADDED
@@ -0,0 +1,413 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import asyncio
3
+ from typing import Optional, Dict, Any, List
4
+ import json
5
+
6
+ try:
7
+ from elevenlabs.client import ElevenLabs
8
+ from elevenlabs.conversational_ai.conversation import Conversation, ClientTools
9
+ from elevenlabs.conversational_ai.default_audio_interface import DefaultAudioInterface
10
+ ELEVENLABS_AVAILABLE = True
11
+ except ImportError:
12
+ ELEVENLABS_AVAILABLE = False
13
+ logger = logging.getLogger(__name__)
14
+ logger.warning("ElevenLabs SDK not available. Install: pip install elevenlabs")
15
+
16
+ import config
17
+ from services.llamaindex_service import LlamaIndexService
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
class ElevenLabsService:
    """
    Enhanced service for ElevenLabs Conversational AI with proper RAG integration.

    Key improvements:
    - Proper client tools registration with event loop handling
    - Built-in RAG through ElevenLabs Knowledge Base
    - Support for both real-time voice and text-based chat
    - Session management and conversation history

    NOTE: every annotation referencing the SDK type ``Conversation`` is a
    string forward reference.  The module guards the ``elevenlabs`` import
    with try/except (ELEVENLABS_AVAILABLE), but attribute annotations and
    function return annotations ARE evaluated at runtime, so an unquoted
    ``Conversation`` would raise NameError and crash the import whenever the
    SDK is not installed — defeating the graceful-degradation guard.
    """

    def __init__(self, llamaindex_service: "LlamaIndexService"):
        """
        Initialize ElevenLabs service with RAG integration

        Args:
            llamaindex_service: LlamaIndex service for document queries
        """
        self.config = config.config
        self.llamaindex_service = llamaindex_service
        self.client = None
        self.client_tools = None
        # Quoted annotation: evaluated at runtime, must not crash without the SDK.
        self.active_conversations: Dict[str, "Conversation"] = {}
        self.conversation_history: Dict[str, List[Dict]] = {}

        if not ELEVENLABS_AVAILABLE:
            logger.error("ElevenLabs SDK not installed. Run: pip install elevenlabs")
            return

        if not self.config.ELEVENLABS_API_KEY:
            logger.warning("ELEVENLABS_API_KEY not configured.")
            return

        try:
            # Initialize ElevenLabs client
            self.client = ElevenLabs(api_key=self.config.ELEVENLABS_API_KEY)
            logger.info("ElevenLabs client initialized successfully")

            # Initialize client tools - CRITICAL: Must be done in async context
            self._init_client_tools()

            logger.info("ElevenLabs service initialized")

        except Exception as e:
            logger.error(f"Error initializing ElevenLabs service: {str(e)}")

    def _init_client_tools(self):
        """Initialize client tools for RAG integration.

        On failure, self.client_tools stays None so callers can detect it.
        """
        try:
            # Newer SDK versions take no constructor arguments.
            try:
                self.client_tools = ClientTools()
            except TypeError:
                # Fallback for older SDKs that might require an event loop.
                try:
                    loop = asyncio.get_event_loop()
                except RuntimeError:
                    loop = asyncio.new_event_loop()
                    asyncio.set_event_loop(loop)
                self.client_tools = ClientTools(loop=loop)

            # Register RAG query tool with proper metadata so the agent knows
            # when (and how) to call back into the document library.
            self.client_tools.register(
                "query_documents",
                handler=self._rag_query_handler,
                description="Search through the user's uploaded documents. Use this tool whenever the user asks questions about their documents, files, or content in their library.",
                parameters={
                    "query": {
                        "type": "string",
                        "description": "The search query or question to find information in the documents"
                    }
                },
                is_async=True
            )

            logger.info("Client tools registered: query_documents")

        except Exception as e:
            logger.error(f"Error initializing client tools: {str(e)}")
            # Keep client_tools as None so we know it failed
            self.client_tools = None

    async def _rag_query_handler(self, params: Dict[str, Any]) -> Dict[str, Any]:
        """
        Enhanced RAG query handler with better error handling and response formatting

        This tool is called by the ElevenLabs agent when it needs to search documents.

        Args:
            params: Dictionary with 'query' key containing user's question

        Returns:
            Dictionary with 'answer' and optional 'sources'
        """
        try:
            query = params.get("query", "")

            if not query or not query.strip():
                return {
                    "answer": "I didn't receive a question to search for. Could you please ask again?"
                }

            logger.info(f"RAG query: {query}")

            # Query LlamaIndex with a timeout so the voice agent never hangs.
            try:
                result = await asyncio.wait_for(
                    self.llamaindex_service.query(query),
                    timeout=self.config.CONVERSATION_TIMEOUT if hasattr(self.config, 'CONVERSATION_TIMEOUT') else 30
                )

                logger.info(f"RAG query successful: {len(result)} chars")

                # Format response for conversational voice
                return {
                    "answer": result,
                    "confidence": "high",
                    "source": "document_library"
                }

            except asyncio.TimeoutError:
                logger.error("RAG query timeout")
                return {
                    "answer": "The search is taking longer than expected. Could you try rephrasing your question?"
                }

        except Exception as e:
            logger.error(f"RAG query error: {str(e)}", exc_info=True)
            return {
                "answer": f"I encountered an error while searching: {str(e)}. Please try again."
            }

    def create_conversation(
        self,
        agent_id: Optional[str] = None,
        session_id: Optional[str] = None,
        use_audio: bool = True
    ) -> Optional["Conversation"]:
        """
        Create a new conversation session

        Args:
            agent_id: ElevenLabs agent ID (uses config default if not provided)
            session_id: Optional session ID for tracking
            use_audio: If True, use audio interface; if False, text-only mode

        Returns:
            Conversation object or None if initialization fails
        """
        if not self.client:
            logger.error("ElevenLabs client not initialized")
            return None

        try:
            agent_id = agent_id or self.config.ELEVENLABS_AGENT_ID

            if not agent_id:
                logger.error("No agent ID provided or configured")
                return None

            # Create audio interface only if requested
            audio_interface = DefaultAudioInterface() if use_audio else None

            # Create conversation with RAG tool
            conversation = Conversation(
                client=self.client,
                agent_id=agent_id,
                requires_auth=True,
                audio_interface=audio_interface,
                client_tools=self.client_tools,
                # Add callbacks for monitoring
                callback_agent_response=lambda response: self._on_agent_response(session_id, response),
                callback_user_transcript=lambda transcript: self._on_user_message(session_id, transcript)
            )

            # Store conversation and initialize history
            if session_id:
                self.active_conversations[session_id] = conversation
                self.conversation_history[session_id] = []

            logger.info(f"Created conversation for agent: {agent_id}")
            return conversation

        except Exception as e:
            logger.error(f"Error creating conversation: {str(e)}")
            return None

    def _on_agent_response(self, session_id: Optional[str], response: str):
        """Track agent responses in the per-session history."""
        if session_id and session_id in self.conversation_history:
            self.conversation_history[session_id].append({
                "role": "assistant",
                "content": response
            })
            logger.debug(f"Agent response: {response[:100]}...")

    def _on_user_message(self, session_id: Optional[str], message: str):
        """Track user messages in the per-session history."""
        if session_id and session_id in self.conversation_history:
            self.conversation_history[session_id].append({
                "role": "user",
                "content": message
            })
            logger.debug(f"User message: {message[:100]}...")

    async def start_conversation(self, session_id: Optional[str] = None) -> Dict[str, Any]:
        """
        Start a new conversation session

        Args:
            session_id: Optional session ID for tracking

        Returns:
            Dictionary with success status and conversation info
        """
        try:
            conversation = self.create_conversation(session_id=session_id, use_audio=False)

            if conversation:
                return {
                    "success": True,
                    "session_id": session_id,
                    "message": "Voice assistant ready. Ask me anything about your documents!"
                }
            else:
                return {
                    "success": False,
                    "error": "Failed to create conversation. Check API configuration."
                }
        except Exception as e:
            logger.error(f"Error starting conversation: {str(e)}")
            return {
                "success": False,
                "error": str(e)
            }

    async def send_text_message(
        self,
        message: str,
        session_id: str
    ) -> Dict[str, Any]:
        """
        Send a text message to the agent and get response

        This is for text-based chat (no audio). Perfect for web interfaces.

        Args:
            message: User's text message
            session_id: Session identifier

        Returns:
            Dictionary with agent's response
        """
        try:
            if not message or not message.strip():
                return {
                    "success": False,
                    "error": "Empty message"
                }

            # For text-based interaction, we directly query the RAG system
            # since ElevenLabs Conversational AI is primarily audio-focused

            # Store user message (only when a session history exists)
            if session_id in self.conversation_history:
                self.conversation_history[session_id].append({
                    "role": "user",
                    "content": message
                })

            # Query RAG system
            response = await self._rag_query_handler({"query": message})

            # Store assistant response
            if session_id in self.conversation_history:
                self.conversation_history[session_id].append({
                    "role": "assistant",
                    "content": response["answer"]
                })

            return {
                "success": True,
                "answer": response["answer"],
                "session_id": session_id
            }

        except Exception as e:
            logger.error(f"Error sending message: {str(e)}")
            return {
                "success": False,
                "error": str(e)
            }

    async def end_conversation(self, session_id: str) -> bool:
        """
        End an active conversation session

        Args:
            session_id: Session identifier

        Returns:
            True if conversation ended successfully
        """
        try:
            if session_id in self.active_conversations:
                conversation = self.active_conversations[session_id]

                # Try to end the session gracefully; cleanup errors are non-fatal.
                try:
                    if hasattr(conversation, 'end_session'):
                        conversation.end_session()
                except Exception as e:
                    logger.warning(f"Error during session cleanup: {str(e)}")

                # Remove from active conversations
                del self.active_conversations[session_id]
                logger.info(f"Ended conversation: {session_id}")
                return True

            return False

        except Exception as e:
            logger.error(f"Error ending conversation: {str(e)}")
            return False

    def get_conversation_history(self, session_id: str) -> List[Dict]:
        """Get conversation history for a session (empty list if unknown)."""
        return self.conversation_history.get(session_id, [])

    def get_available_voices(self) -> List[Dict[str, str]]:
        """
        Get list of available voice models

        Returns:
            List of voice model information
        """
        try:
            if not self.client:
                return []

            voices = self.client.voices.get_all()

            return [
                {
                    "voice_id": voice.voice_id,
                    "name": voice.name,
                    "category": getattr(voice, 'category', "general")
                }
                for voice in voices.voices
            ]

        except Exception as e:
            logger.error(f"Error getting voices: {str(e)}")
            return []

    def is_available(self) -> bool:
        """Check if ElevenLabs service is available and configured"""
        return ELEVENLABS_AVAILABLE and self.client is not None

    async def test_connection(self) -> Dict[str, Any]:
        """
        Test ElevenLabs API connection

        Returns:
            Dictionary with test results
        """
        try:
            if not self.client:
                return {
                    "status": "error",
                    "message": "Client not initialized"
                }

            # Test API by fetching voices
            voices = self.get_available_voices()

            # Test RAG tool
            test_result = await self._rag_query_handler({"query": "test"})

            return {
                "status": "success",
                "message": "ElevenLabs API connected",
                "voices_available": len(voices),
                "rag_tool_working": "answer" in test_result,
                "client_tools_registered": self.client_tools is not None
            }

        except Exception as e:
            logger.error(f"Connection test failed: {str(e)}")
            return {
                "status": "error",
                "message": str(e)
            }
services/embedding_service.py ADDED
@@ -0,0 +1,243 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import asyncio
3
+ from typing import List, Optional, Dict, Any
4
+ import numpy as np
5
+ from sentence_transformers import SentenceTransformer
6
+ import torch
7
+ import openai
8
+ import config
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
class EmbeddingService:
    """Produces dense vector embeddings for text.

    The backend is chosen from ``config.EMBEDDING_MODEL``: names starting
    with ``text-embedding-`` use the OpenAI embeddings API (when a key is
    configured), anything else loads a local sentence-transformers model.
    If the configured local model fails to load, a small fallback model
    (``all-MiniLM-L6-v2``) is tried before giving up.
    """

    def __init__(self):
        self.config = config.config
        self.model_name = self.config.EMBEDDING_MODEL
        self.model = None  # SentenceTransformer instance (local backend only)
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.openai_client = None
        self.is_openai_model = False

        # Create the OpenAI client whenever a key exists so an API-backed
        # embedding model can be selected in _load_model.
        if self.config.OPENAI_API_KEY:
            self.openai_client = openai.OpenAI(api_key=self.config.OPENAI_API_KEY)

        # Load eagerly so configuration problems surface at startup.
        self._load_model()

    def _load_model(self):
        """Select the embedding backend (OpenAI API or local model).

        Falls back to a local MiniLM model when an OpenAI model is requested
        without a key, or when the configured local model fails to load.
        Raises only if the fallback model also fails.
        """
        try:
            logger.info(f"Loading embedding model: {self.model_name}")

            if self.model_name.startswith("text-embedding-"):
                if not self.openai_client:
                    logger.warning(f"OpenAI model {self.model_name} requested but OPENAI_API_KEY not found. Falling back to local model.")
                    self.model_name = "sentence-transformers/all-MiniLM-L6-v2"
                    self.is_openai_model = False
                    self.model = SentenceTransformer(self.model_name, device=self.device)
                else:
                    self.is_openai_model = True
                    logger.info(f"Using OpenAI embedding model: {self.model_name}")
            else:
                self.is_openai_model = False
                self.model = SentenceTransformer(self.model_name, device=self.device)
                logger.info(f"Local embedding model loaded successfully on {self.device}")

        except Exception as e:
            logger.error(f"Failed to load embedding model: {str(e)}")
            # Fallback to a smaller model before giving up entirely.
            try:
                self.model_name = "all-MiniLM-L6-v2"
                self.is_openai_model = False
                self.model = SentenceTransformer(self.model_name, device=self.device)
                logger.info(f"Loaded fallback embedding model: {self.model_name}")
            except Exception as fallback_error:
                logger.error(f"Failed to load fallback model: {str(fallback_error)}")
                raise

    async def generate_embeddings(self, texts: List[str], batch_size: int = 32) -> List[List[float]]:
        """Generate embeddings for a list of texts.

        NOTE: empty/whitespace-only texts are dropped, so the result may be
        shorter than ``texts``. Callers that need positional alignment with
        their inputs should use :meth:`embed_chunks` instead.

        Args:
            texts: Texts to embed.
            batch_size: Number of texts per model/API call.

        Returns:
            One embedding (list of floats) per non-empty input text.

        Raises:
            RuntimeError: if a local model was required but not loaded.
        """
        if not texts:
            return []

        if not self.is_openai_model and self.model is None:
            raise RuntimeError("Embedding model not loaded")

        try:
            # Filter out empty texts
            non_empty_texts = [text for text in texts if text and text.strip()]
            if not non_empty_texts:
                logger.warning("No non-empty texts provided for embedding")
                return []

            logger.info(f"Generating embeddings for {len(non_empty_texts)} texts using {self.model_name}")

            # Process in batches to manage memory/API limits
            all_embeddings = []
            for i in range(0, len(non_empty_texts), batch_size):
                batch = non_empty_texts[i:i + batch_size]

                # Offload the blocking model/API call to the default executor
                # so the event loop stays responsive. get_running_loop() is the
                # non-deprecated way to obtain the loop inside a coroutine.
                loop = asyncio.get_running_loop()
                batch_embeddings = await loop.run_in_executor(
                    None,
                    self._generate_batch_embeddings,
                    batch
                )
                all_embeddings.extend(batch_embeddings)

            logger.info(f"Generated {len(all_embeddings)} embeddings")
            return all_embeddings

        except Exception as e:
            logger.error(f"Error generating embeddings: {str(e)}")
            raise

    def _generate_batch_embeddings(self, texts: List[str]) -> List[List[float]]:
        """Generate embeddings for a batch of texts (synchronous; runs in executor)."""
        try:
            if self.is_openai_model:
                # OpenAI Embeddings API call.
                response = self.openai_client.embeddings.create(
                    input=texts,
                    model=self.model_name
                )
                return [data.embedding for data in response.data]
            else:
                # Local SentenceTransformer; normalized so dot product == cosine.
                embeddings = self.model.encode(
                    texts,
                    convert_to_numpy=True,
                    normalize_embeddings=True,
                    batch_size=len(texts)
                )
                return embeddings.tolist()
        except Exception as e:
            logger.error(f"Error in batch embedding generation: {str(e)}")
            raise

    async def generate_single_embedding(self, text: str) -> Optional[List[float]]:
        """Generate an embedding for one text; None for empty input or on error."""
        if not text or not text.strip():
            return None

        try:
            embeddings = await self.generate_embeddings([text])
            return embeddings[0] if embeddings else None
        except Exception as e:
            logger.error(f"Error generating single embedding: {str(e)}")
            return None

    def get_embedding_dimension(self) -> int:
        """Return the dimensionality of embeddings from the current backend.

        OpenAI dimensions are inferred from the model name (1536 for the
        "small"/"ada" families, 3072 for "large"); local models report their
        own dimension.
        """
        if self.is_openai_model:
            if "small" in self.model_name:
                return 1536
            elif "large" in self.model_name:
                return 3072
            elif "ada" in self.model_name:
                return 1536
            else:
                # Unknown OpenAI model: assume the common 1536-dim default.
                return 1536

        if self.model is None:
            raise RuntimeError("Embedding model not loaded")

        return self.model.get_sentence_embedding_dimension()

    def compute_similarity(self, embedding1: List[float], embedding2: List[float]) -> float:
        """Cosine similarity between two embeddings (0.0 on error).

        NOTE(review): a zero-norm input yields a NaN rather than an
        exception — callers should pass non-degenerate vectors.
        """
        try:
            # Convert to numpy arrays
            emb1 = np.array(embedding1)
            emb2 = np.array(embedding2)

            # Compute cosine similarity
            similarity = np.dot(emb1, emb2) / (np.linalg.norm(emb1) * np.linalg.norm(emb2))

            return float(similarity)
        except Exception as e:
            logger.error(f"Error computing similarity: {str(e)}")
            return 0.0

    def compute_similarities(self, query_embedding: List[float], embeddings: List[List[float]]) -> List[float]:
        """Cosine similarity of one query against many embeddings (zeros on error)."""
        try:
            query_emb = np.array(query_embedding)
            emb_matrix = np.array(embeddings)

            # Vectorized cosine similarity across all rows.
            similarities = np.dot(emb_matrix, query_emb) / (
                np.linalg.norm(emb_matrix, axis=1) * np.linalg.norm(query_emb)
            )

            return similarities.tolist()
        except Exception as e:
            logger.error(f"Error computing similarities: {str(e)}")
            return [0.0] * len(embeddings)

    async def embed_chunks(self, chunks: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Embed chunk dicts, attaching an ``embedding`` key to copies of them.

        FIX: embeddings are matched back to chunks by original index. The
        previous implementation zipped the (empty-filtered) embedding list
        against the unfiltered chunk list, so a single empty-content chunk
        shifted every subsequent embedding onto the wrong chunk.
        Chunks with no usable content are returned unchanged (no key added).
        """
        if not chunks:
            return []

        try:
            texts = [chunk.get('content', '') for chunk in chunks]

            # Keep (position, text) pairs so results map back to the right
            # chunk even when some chunks have empty content.
            indexed = [(i, t) for i, t in enumerate(texts) if t and t.strip()]
            embeddings = await self.generate_embeddings([t for _, t in indexed])

            embedding_at = {i: emb for (i, _), emb in zip(indexed, embeddings)}

            embedded_chunks = []
            for i, chunk in enumerate(chunks):
                emb = embedding_at.get(i)
                if emb is not None:
                    chunk_copy = chunk.copy()
                    chunk_copy['embedding'] = emb
                    embedded_chunks.append(chunk_copy)
                else:
                    logger.warning(f"No embedding generated for chunk {i}")
                    embedded_chunks.append(chunk)

            return embedded_chunks
        except Exception as e:
            logger.error(f"Error embedding chunks: {str(e)}")
            raise

    def validate_embedding(self, embedding: List[float]) -> bool:
        """Check that *embedding* is a well-formed vector for the current model:
        a non-empty list of the expected dimension with no NaN/inf values."""
        try:
            if not embedding:
                return False

            if not isinstance(embedding, list):
                return False

            if len(embedding) != self.get_embedding_dimension():
                return False

            # Check for NaN or infinite values
            emb_array = np.array(embedding)
            if np.isnan(emb_array).any() or np.isinf(emb_array).any():
                return False

            return True
        except Exception:
            return False

    async def get_model_info(self) -> Dict[str, Any]:
        """Describe the active embedding backend (name, device, dimension, ...)."""
        try:
            return {
                "model_name": self.model_name,
                "device": "openai-api" if self.is_openai_model else self.device,
                "embedding_dimension": self.get_embedding_dimension(),
                "max_sequence_length": "8191" if self.is_openai_model else getattr(self.model, 'max_seq_length', 'unknown'),
                "model_loaded": self.is_openai_model or (self.model is not None)
            }
        except Exception as e:
            logger.error(f"Error getting model info: {str(e)}")
            return {"error": str(e)}
services/llamaindex_service.py ADDED
@@ -0,0 +1,203 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import os
3
+ from typing import List, Optional, Any
4
+ from pathlib import Path
5
+ import shutil
6
+ import asyncio
7
+
8
+ from llama_index.core import (
9
+ VectorStoreIndex,
10
+ Document,
11
+ StorageContext,
12
+ load_index_from_storage,
13
+ Settings,
14
+ SummaryIndex
15
+ )
16
+ from llama_index.core.tools import QueryEngineTool, ToolMetadata
17
+ from llama_index.core.agent import ReActAgent
18
+ from llama_index.llms.openai import OpenAI
19
+ from llama_index.embeddings.huggingface import HuggingFaceEmbedding
20
+ from llama_index.embeddings.openai import OpenAIEmbedding
21
+
22
+ import config
23
+ from services.document_store_service import DocumentStoreService
24
+
25
+ logger = logging.getLogger(__name__)
26
+
27
class LlamaIndexService:
    """Maintains a LlamaIndex vector index plus a ReAct agent over the
    application's document store, persisted under ``DATA_DIR/llamaindex_storage``."""

    def __init__(self, document_store: DocumentStoreService):
        self.document_store = document_store
        self.config = config.config
        self.storage_dir = Path(self.config.DATA_DIR) / "llamaindex_storage"
        self.index = None
        self.agent = None
        self.is_initialized = False

        self._initialize_settings()
        # Loading a previously persisted index is best-effort only.
        self._try_load_from_storage()

    def _initialize_settings(self):
        """Configure the global LlamaIndex Settings (LLM and embedding model)."""
        try:
            # Chat LLM: OpenAI takes precedence, then Nebius via the
            # OpenAI-compatible client.
            if self.config.OPENAI_API_KEY:
                Settings.llm = OpenAI(model=self.config.OPENAI_MODEL, api_key=self.config.OPENAI_API_KEY)
                logger.info(f"LlamaIndex using OpenAI model: {self.config.OPENAI_MODEL}")
            elif self.config.NEBIUS_API_KEY:
                Settings.llm = OpenAI(
                    model=self.config.NEBIUS_MODEL,
                    api_key=self.config.NEBIUS_API_KEY,
                    api_base=self.config.NEBIUS_BASE_URL
                )
                logger.info(f"LlamaIndex using Nebius model: {self.config.NEBIUS_MODEL}")
            else:
                logger.warning("No API key found for LlamaIndex LLM. Agentic features may fail.")

            # Embedding backend mirrors EMBEDDING_MODEL from config.
            wants_openai_embeddings = self.config.EMBEDDING_MODEL.startswith("text-embedding-")
            if wants_openai_embeddings and self.config.OPENAI_API_KEY:
                Settings.embed_model = OpenAIEmbedding(
                    model=self.config.EMBEDDING_MODEL,
                    api_key=self.config.OPENAI_API_KEY
                )
            elif wants_openai_embeddings:
                # OpenAI embeddings requested but no key: use a small local model.
                Settings.embed_model = HuggingFaceEmbedding(
                    model_name="sentence-transformers/all-MiniLM-L6-v2"
                )
            else:
                Settings.embed_model = HuggingFaceEmbedding(
                    model_name=self.config.EMBEDDING_MODEL
                )

        except Exception as e:
            logger.error(f"Error initializing LlamaIndex settings: {str(e)}")

    def _try_load_from_storage(self):
        """Load a persisted index if storage exists; never raises."""
        try:
            has_persisted_data = self.storage_dir.exists() and any(self.storage_dir.iterdir())
            if not has_persisted_data:
                logger.info("No existing LlamaIndex storage found. Waiting for initialization.")
                return
            logger.info("Loading LlamaIndex from storage...")
            ctx = StorageContext.from_defaults(persist_dir=str(self.storage_dir))
            self.index = load_index_from_storage(ctx)
            self._initialize_agent()
            self.is_initialized = True
        except Exception as e:
            logger.error(f"Error loading LlamaIndex from storage: {str(e)}")

    async def initialize(self):
        """Async initialization: build the index from the store if not loaded."""
        try:
            logger.info("Starting LlamaIndex async initialization...")
            if self.index is None:
                await self.sync_from_document_store()

            self.is_initialized = True
            logger.info("LlamaIndex async initialization complete.")
        except Exception as e:
            logger.error(f"Error during LlamaIndex async initialization: {str(e)}")

    async def sync_from_document_store(self):
        """Rebuild the index (and agent) from the documents in the store."""
        try:
            logger.info("Syncing documents from DocumentStore to LlamaIndex...")

            records = await self.document_store.list_documents(limit=1000)

            if not records:
                logger.warning("No documents found in DocumentStore. Creating empty index.")
                # Empty library: drop any stale index/agent instead of failing.
                self.index = None
                self.agent = None
                return

            # Convert store records into LlamaIndex documents, skipping
            # anything without usable text content.
            converted = [
                Document(
                    text=record.content,
                    metadata={
                        "filename": record.filename,
                        "document_id": record.id,
                        **record.metadata
                    }
                )
                for record in records
                if record.content and len(record.content.strip()) > 0
            ]

            if not converted:
                logger.warning("Documents found but content was empty.")
                return

            logger.info(f"Building LlamaIndex with {len(converted)} documents...")
            self.index = VectorStoreIndex.from_documents(converted)

            # Persist so subsequent startups can reload without rebuilding.
            if not self.storage_dir.exists():
                self.storage_dir.mkdir(parents=True, exist_ok=True)
            self.index.storage_context.persist(persist_dir=str(self.storage_dir))

            # The agent wraps the index, so it must be rebuilt too.
            self._initialize_agent()
            logger.info("LlamaIndex sync complete.")

        except Exception as e:
            logger.error(f"Error syncing LlamaIndex: {str(e)}")

    async def sync_on_demand(self):
        """Manual trigger for syncing documents."""
        await self.sync_from_document_store()
        return True

    def _initialize_agent(self):
        """Build the ReAct agent around the current index (no-op without one)."""
        try:
            if not self.index:
                return

            engine = self.index.as_query_engine()

            search_tool = QueryEngineTool(
                query_engine=engine,
                metadata=ToolMetadata(
                    name="document_search",
                    description="Search and retrieve information from the document library. Use this for specific questions about content."
                )
            )

            # The ReAct agent needs the globally configured LLM.
            self.agent = ReActAgent.from_tools(
                [search_tool],
                llm=Settings.llm,
                verbose=True
            )
            logger.info("LlamaIndex ReAct agent initialized")

        except Exception as e:
            logger.error(f"Error initializing LlamaIndex agent: {str(e)}")

    async def query(self, query_text: str) -> str:
        """Answer a query via the agent, lazily (re)building it when missing."""

        # Auto-recovery: try to bring the agent up if it isn't there yet.
        if not self.agent:
            logger.info("Agent not found during query. Attempting to initialize...")
            await self.initialize()

        # Still missing — report the most likely cause to the user.
        if not self.agent:
            if not self.index:
                return "I can't answer that yet because there are no documents in the library. Please upload a document first."

            return "System Error: The AI agent failed to start. Please check if your OPENAI_API_KEY is correct in the .env file."

        try:
            response = await self.agent.achat(query_text)
            return str(response)
        except Exception as e:
            logger.error(f"Error querying LlamaIndex agent: {str(e)}")
            return f"I encountered an error searching the documents: {str(e)}"
services/llm_service.py ADDED
@@ -0,0 +1,420 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from mistralai import Mistral
2
+ import logging
3
+ import asyncio
4
+ from typing import List, Dict, Any, Optional
5
+
6
+ import openai
7
+ import config
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+ class LLMService:
12
+ def __init__(self):
13
+ self.config = config.config
14
+
15
+ self.nebius_client = None
16
+ self.mistral_client = None
17
+ self.openai_client = None
18
+
19
+ self._initialize_clients()
20
+
21
+ def _initialize_clients(self):
22
+ """Initialize LLM clients"""
23
+ try:
24
+ if self.config.OPENAI_API_KEY:
25
+ self.openai_client = openai.OpenAI(
26
+ api_key=self.config.OPENAI_API_KEY
27
+ )
28
+ logger.info("OpenAI client initialized")
29
+
30
+ if self.config.NEBIUS_API_KEY:
31
+ self.nebius_client = openai.OpenAI(
32
+ api_key=self.config.NEBIUS_API_KEY,
33
+ base_url=self.config.NEBIUS_BASE_URL
34
+ )
35
+ logger.info("NEBIUS client initialized")
36
+
37
+ if self.config.MISTRAL_API_KEY:
38
+ self.mistral_client = Mistral( # Standard sync client
39
+ api_key=self.config.MISTRAL_API_KEY
40
+ )
41
+ logger.info("Mistral client initialized")
42
+
43
+ # Check if at least one client is initialized
44
+ if not any([self.openai_client, self.nebius_client, self.mistral_client]):
45
+ logger.warning("No LLM clients could be initialized based on current config. Check API keys.")
46
+ else:
47
+ logger.info("LLM clients initialized successfully (at least one).")
48
+
49
+ except Exception as e:
50
+ logger.error(f"Error initializing LLM clients: {str(e)}")
51
+ raise
52
+
53
+ async def generate_text(self, prompt: str, model: str = "auto", max_tokens: int = 1000, temperature: float = 0.7) -> str:
54
+ """Generate text using the specified model, with new priority for 'auto'."""
55
+ try:
56
+ selected_model_name_for_call: str = ""
57
+
58
+ if model == "auto":
59
+ # Priority: 1. NEBIUS (Llama 3.3 - Cost Effective), 2. OpenAI (GPT-5.1), 3. Mistral
60
+ if self.nebius_client and self.config.NEBIUS_MODEL:
61
+ selected_model_name_for_call = self.config.NEBIUS_MODEL
62
+ logger.debug(f"Auto-selected NEBIUS model: {selected_model_name_for_call}")
63
+ return await self._generate_with_nebius(prompt, selected_model_name_for_call, max_tokens, temperature)
64
+ elif self.openai_client and self.config.OPENAI_MODEL:
65
+ selected_model_name_for_call = self.config.OPENAI_MODEL
66
+ logger.debug(f"Auto-selected OpenAI model: {selected_model_name_for_call}")
67
+ return await self._generate_with_openai(prompt, selected_model_name_for_call, max_tokens, temperature)
68
+ elif self.mistral_client and self.config.MISTRAL_MODEL:
69
+ selected_model_name_for_call = self.config.MISTRAL_MODEL
70
+ logger.debug(f"Auto-selected Mistral model: {selected_model_name_for_call}")
71
+ return await self._generate_with_mistral(prompt, selected_model_name_for_call, max_tokens, temperature)
72
+ else:
73
+ logger.error("No LLM clients available for 'auto' mode or default models not configured.")
74
+ raise ValueError("No LLM clients available for 'auto' mode or default models not configured.")
75
+
76
+ elif model == "fast":
77
+ # Priority for speed: 1. OpenAI (GPT-5-mini), 2. Mistral Small, 3. Nebius
78
+ if self.openai_client and self.config.FAST_MODEL:
79
+ return await self._generate_with_openai(prompt, self.config.FAST_MODEL, max_tokens, temperature)
80
+ # Fallback to auto if fast model not available
81
+ return await self.generate_text(prompt, "auto", max_tokens, temperature)
82
+
83
+ elif model.startswith("gpt-") or model.startswith("openai/") or "o1-" in model or "o3-" in model:
84
+ if self.openai_client:
85
+ actual_model = model.split('/')[-1] if '/' in model else model
86
+ return await self._generate_with_openai(prompt, actual_model, max_tokens, temperature)
87
+ elif self.nebius_client and "gpt-oss" in model: # Handle Nebius "openai/" prefix if any
88
+ actual_model = model.split('/')[-1] if '/' in model else model
89
+ return await self._generate_with_nebius(prompt, actual_model, max_tokens, temperature)
90
+ else:
91
+ raise ValueError("OpenAI client not available. Check API key.")
92
+
93
+ elif model.lower().startswith("nebius/") or model.lower().startswith("meta-llama/"):
94
+ if not self.nebius_client:
95
+ raise ValueError("NEBIUS client not available. Check API key.")
96
+ return await self._generate_with_nebius(prompt, model, max_tokens, temperature)
97
+
98
+ elif model.startswith("mistral"):
99
+ if not self.mistral_client:
100
+ raise ValueError("Mistral client not available. Check API key or model prefix.")
101
+ return await self._generate_with_mistral(prompt, model, max_tokens, temperature)
102
+
103
+ else:
104
+ raise ValueError(f"Unsupported model: {model}. Must start with 'gpt-', 'openai/', 'nebius/', 'mistral', or be 'auto'.")
105
+
106
+ except Exception as e:
107
+ logger.error(f"Error generating text with model '{model}': {str(e)}")
108
+ raise
109
+
110
+ async def _generate_with_openai(self, prompt: str, model_name: str, max_tokens: int, temperature: float) -> str:
111
+ """Generate text using OpenAI"""
112
+ if not self.openai_client:
113
+ raise RuntimeError("OpenAI client not initialized.")
114
+ try:
115
+ logger.debug(f"Generating with OpenAI model: {model_name}, max_tokens: {max_tokens}, temp: {temperature}")
116
+ loop = asyncio.get_event_loop()
117
+
118
+ # Determine correct token parameter based on model family
119
+ # GPT-5, o1, o3 series use max_completion_tokens
120
+ use_completion_tokens = any(x in model_name for x in ["gpt-5", "o1-", "o3-"])
121
+
122
+ kwargs = {
123
+ "model": model_name,
124
+ "messages": [{"role": "user", "content": prompt}],
125
+ }
126
+
127
+ if use_completion_tokens:
128
+ kwargs["max_completion_tokens"] = max_tokens
129
+ # Reasoning models enforce temperature=1
130
+ kwargs["temperature"] = 1
131
+ if temperature != 1:
132
+ logger.warning(f"Temperature {temperature} ignored for model {model_name} (requires 1).")
133
+ else:
134
+ kwargs["max_tokens"] = max_tokens
135
+ kwargs["temperature"] = temperature
136
+
137
+ response = await loop.run_in_executor(
138
+ None,
139
+ lambda: self.openai_client.chat.completions.create(**kwargs)
140
+ )
141
+ if response.choices and response.choices[0].message:
142
+ content = response.choices[0].message.content
143
+ if content is not None:
144
+ return content.strip()
145
+ return ""
146
+ except Exception as e:
147
+ logger.error(f"Error with OpenAI generation (model: {model_name}): {str(e)}")
148
+ raise
149
+
150
+ async def _generate_with_nebius(self, prompt: str, model_name: str, max_tokens: int, temperature: float) -> str:
151
+ """Generate text using NEBIUS (OpenAI OSS models via sync client)"""
152
+ if not self.nebius_client:
153
+ raise RuntimeError("NEBIUS client not initialized.")
154
+ try:
155
+ logger.debug(f"Generating with NEBIUS model: {model_name}, max_tokens: {max_tokens}, temp: {temperature}, prompt: '{prompt[:50]}...'")
156
+ loop = asyncio.get_event_loop()
157
+
158
+ response = await loop.run_in_executor(
159
+ None,
160
+ lambda: self.nebius_client.chat.completions.create(
161
+ model=model_name,
162
+ messages=[{"role": "user", "content": prompt}],
163
+ max_tokens=max_tokens,
164
+ temperature=temperature
165
+ )
166
+ )
167
+ if response.choices and response.choices[0].message:
168
+ content = response.choices[0].message.content
169
+ if content is not None:
170
+ return content.strip()
171
+ else:
172
+ logger.warning(f"NEBIUS response message content is None for model {model_name}.")
173
+ return ""
174
+ else:
175
+ logger.warning(f"NEBIUS response did not contain expected choices or message for model {model_name}.")
176
+ return ""
177
+ except Exception as e:
178
+ logger.error(f"Error with NEBIUS generation (model: {model_name}): {str(e)}")
179
+ raise
180
+
181
+ async def _generate_with_mistral(self, prompt: str, model_name: str, max_tokens: int, temperature: float) -> str:
182
+ """Generate text using Mistral (Sync via run_in_executor)"""
183
+ if not self.mistral_client:
184
+ raise RuntimeError("Mistral client not initialized.")
185
+ try:
186
+ logger.debug(f"Generating with Mistral model: {model_name}, temp: {temperature}, prompt: '{prompt[:50]}...' (max_tokens: {max_tokens} - note: not directly used by MistralClient.chat)")
187
+ loop = asyncio.get_event_loop()
188
+
189
+ response = await loop.run_in_executor(
190
+ None,
191
+ lambda: self.mistral_client.chat(
192
+ model=model_name,
193
+ messages=[{"role": "user", "content": prompt}],
194
+ max_tokens=max_tokens,
195
+ temperature=temperature
196
+ )
197
+ )
198
+ if response.choices and response.choices[0].message:
199
+ content = response.choices[0].message.content
200
+ if content is not None:
201
+ return content.strip()
202
+ else:
203
+ logger.warning(f"Mistral response message content is None for model {model_name}.")
204
+ return ""
205
+ else:
206
+ logger.warning(f"Mistral response did not contain expected choices or message for model {model_name}.")
207
+ return ""
208
+ except Exception as e:
209
+ logger.error(f"Error with Mistral generation (model: {model_name}): {str(e)}")
210
+ raise
211
+
212
+
213
+ async def summarize(self, text: str, style: str = "concise", max_length: Optional[int] = None) -> str:
214
+ if not text.strip():
215
+ return ""
216
+
217
+ style_prompts = {
218
+ "concise": "Provide a concise summary of the following text, focusing on the main points:",
219
+ "detailed": "Provide a detailed summary of the following text, including key details and supporting information:",
220
+ "bullet_points": "Summarize the following text as a list of bullet points highlighting the main ideas:",
221
+ "executive": "Provide an executive summary of the following text, focusing on key findings and actionable insights:"
222
+ }
223
+ prompt_template = style_prompts.get(style, style_prompts["concise"])
224
+ if max_length:
225
+ prompt_template += f" Keep the summary under approximately {max_length} words."
226
+
227
+ prompt = f"{prompt_template}\n\nText to summarize:\n{text}\n\nSummary:"
228
+
229
+ try:
230
+ summary_max_tokens = (max_length * 2) if max_length else 500
231
+ summary = await self.generate_text(prompt, model="auto", max_tokens=summary_max_tokens, temperature=0.3)
232
+ return summary.strip()
233
+ except Exception as e:
234
+ logger.error(f"Error generating summary: {str(e)}")
235
+ return "Error generating summary"
236
+
237
+ async def generate_tags(self, text: str, max_tags: int = 5) -> List[str]:
238
+ if not text.strip():
239
+ return []
240
+
241
+ prompt = f"""Generate up to {max_tags} relevant tags for the following text.
242
+ Tags should be concise, descriptive keywords or phrases (1-3 words typically) that capture the main topics or themes.
243
+ Return only the tags, separated by commas. Do not include any preamble or explanation.
244
+
245
+ Text:
246
+ {text}
247
+
248
+ Tags:"""
249
+
250
+ try:
251
+ # Use FAST_MODEL for tags
252
+ response = await self.generate_text(prompt, model="fast", max_tokens=100, temperature=1)
253
+ tags = [tag.strip().lower() for tag in response.split(',') if tag.strip()]
254
+ tags = [tag for tag in tags if tag and len(tag) > 1 and len(tag) < 50]
255
+ return list(dict.fromkeys(tags))[:max_tags]
256
+ except Exception as e:
257
+ logger.error(f"Error generating tags: {str(e)}")
258
+ return []
259
+
260
+ async def categorize(self, text: str, categories: List[str]) -> str:
261
+ if not text.strip() or not categories:
262
+ return "Uncategorized"
263
+
264
+ categories_str = ", ".join([f"'{cat}'" for cat in categories])
265
+ prompt = f"""Classify the following text into ONE of these categories: {categories_str}.
266
+ Choose the single most appropriate category based on the content and main theme of the text.
267
+ Return only the category name as a string, exactly as it appears in the list provided. Do not add any other text or explanation.
268
+
269
+ Text to classify:
270
+ {text}
271
+
272
+ Category:"""
273
+
274
+ try:
275
+ # Use FAST_MODEL for categorization
276
+ response = await self.generate_text(prompt, model="fast", max_tokens=50, temperature=0.1)
277
+ category_candidate = response.strip().strip("'\"")
278
+
279
+ for cat in categories:
280
+ if cat.lower() == category_candidate.lower():
281
+ return cat
282
+
283
+ logger.warning(f"LLM returned category '{category_candidate}' which is not in the provided list: {categories}. Falling back.")
284
+ return categories[0] if categories else "Uncategorized"
285
+ except Exception as e:
286
+ logger.error(f"Error categorizing text: {str(e)}")
287
+ return "Uncategorized"
288
+
289
+ async def answer_question(self, question: str, context: str, max_context_length: int = 3000) -> str:
290
+ if not question.strip():
291
+ return "No question provided."
292
+ if not context.strip():
293
+ return "I don't have enough context to answer this question. Please provide relevant information."
294
+
295
+ if len(context) > max_context_length:
296
+ context = context[:max_context_length] + "..."
297
+ logger.warning(f"Context truncated to {max_context_length} characters for question answering.")
298
+
299
+ prompt = f"""You are an expert Q&A assistant. Your task is to synthesize an answer to the user's question based *only* on the provided source documents.
300
+ Analyze all the source documents provided in the context below.
301
+ If the information is present, provide a comprehensive answer.
302
+
303
+ Here are the source documents:
304
+ --- START OF CONTEXT ---
305
+ {context}
306
+ --- END OF CONTEXT ---
307
+
308
+ Based on the context above, please provide a clear and concise answer to the following question.
309
+
310
+ Question: {question}
311
+
312
+ Answer:"""
313
+
314
+ try:
315
+ answer = await self.generate_text(prompt, model="auto", max_tokens=800, temperature=0.5)
316
+ return answer.strip()
317
+ except Exception as e:
318
+ logger.error(f"Error answering question: {str(e)}")
319
+ return "I encountered an error while trying to answer your question."
320
+
321
+ async def extract_key_information(self, text: str) -> Dict[str, Any]:
322
+ if not text.strip():
323
+ return {}
324
+
325
+ prompt = f"""Analyze the following text and extract key information.
326
+ Provide the response as a JSON object with the following keys:
327
+ - "main_topic": (string) The main topic or subject of the text.
328
+ - "key_points": (array of strings) A list of 3-5 key points or takeaways.
329
+ - "entities": (array of strings) Important people, places, organizations, or products mentioned.
330
+ - "sentiment": (string) Overall sentiment of the text (e.g., "positive", "neutral", "negative", "mixed").
331
+ - "content_type": (string) The perceived type of content (e.g., "article", "email", "report", "conversation", "advertisement", "other").
332
+
333
+ If a piece of information is not found or not applicable, use null or an empty array/string as appropriate for the JSON structure.
334
+
335
+ Text to analyze:
336
+ ---
337
+ {text}
338
+ ---
339
+
340
+ JSON Analysis:"""
341
+
342
+ try:
343
+ response_str = await self.generate_text(prompt, model="auto", max_tokens=500, temperature=0.4)
344
+
345
+ import json
346
+ try:
347
+ if response_str.startswith("```json"):
348
+ response_str = response_str.lstrip("```json").rstrip("```").strip()
349
+
350
+ info = json.loads(response_str)
351
+ expected_keys = {"main_topic", "key_points", "entities", "sentiment", "content_type"}
352
+ if not expected_keys.issubset(info.keys()):
353
+ logger.warning(f"Extracted information missing some expected keys. Got: {info.keys()}")
354
+ return info
355
+ except json.JSONDecodeError as je:
356
+ logger.error(f"Failed to parse JSON from LLM response for key_information: {je}")
357
+ logger.debug(f"LLM Response string was: {response_str}")
358
+ info_fallback = {}
359
+ lines = response_str.split('\n')
360
+ for line in lines:
361
+ if ':' in line:
362
+ key, value = line.split(':', 1)
363
+ key_clean = key.strip().lower().replace(' ', '_')
364
+ value_clean = value.strip()
365
+ if value_clean:
366
+ if key_clean in ["key_points", "entities"] and '[' in value_clean and ']' in value_clean:
367
+ try:
368
+ info_fallback[key_clean] = [item.strip().strip("'\"") for item in value_clean.strip('[]').split(',') if item.strip()]
369
+ except: info_fallback[key_clean] = value_clean
370
+ else: info_fallback[key_clean] = value_clean
371
+ if info_fallback:
372
+ logger.info("Successfully parsed key information using fallback line-based method.")
373
+ return info_fallback
374
+ return {"error": "Failed to parse LLM output", "raw_response": response_str}
375
+ except Exception as e:
376
+ logger.error(f"Error extracting key information: {str(e)}")
377
+ return {"error": f"General error extracting key information: {str(e)}"}
378
+
379
+ async def check_availability(self) -> Dict[str, bool]:
380
+ """Check which LLM services are available by making a tiny test call."""
381
+ availability = {
382
+ "openai": False,
383
+ "nebius": False,
384
+ "mistral": False
385
+ }
386
+ test_prompt = "Hello"
387
+ test_max_tokens = 5
388
+ test_temp = 0.1
389
+
390
+ logger.info("Checking LLM availability...")
391
+
392
+ if self.openai_client and self.config.OPENAI_MODEL:
393
+ try:
394
+ logger.debug(f"Testing OpenAI availability with model {self.config.OPENAI_MODEL}...")
395
+ test_response = await self._generate_with_openai(test_prompt, self.config.OPENAI_MODEL, test_max_tokens, test_temp)
396
+ availability["openai"] = bool(test_response.strip())
397
+ except Exception as e:
398
+ logger.warning(f"OpenAI availability check failed for model {self.config.OPENAI_MODEL}: {e}")
399
+ logger.info(f"OpenAI available: {availability['openai']}")
400
+
401
+ if self.nebius_client and self.config.NEBIUS_MODEL:
402
+ try:
403
+ logger.debug(f"Testing NEBIUS availability with model {self.config.NEBIUS_MODEL}...")
404
+ test_response = await self._generate_with_nebius(test_prompt, self.config.NEBIUS_MODEL, test_max_tokens, test_temp)
405
+ availability["nebius"] = bool(test_response.strip())
406
+ except Exception as e:
407
+ logger.warning(f"NEBIUS availability check failed for model {self.config.NEBIUS_MODEL}: {e}")
408
+ logger.info(f"NEBIUS available: {availability['nebius']}")
409
+
410
+ if self.mistral_client and self.config.MISTRAL_MODEL:
411
+ try:
412
+ logger.debug(f"Testing Mistral availability with model {self.config.MISTRAL_MODEL}...")
413
+ test_response = await self._generate_with_mistral(test_prompt, self.config.MISTRAL_MODEL, test_max_tokens, test_temp)
414
+ availability["mistral"] = bool(test_response.strip())
415
+ except Exception as e:
416
+ logger.warning(f"Mistral availability check failed for model {self.config.MISTRAL_MODEL}: {e}")
417
+ logger.info(f"Mistral available: {availability['mistral']}")
418
+
419
+ logger.info(f"Final LLM Availability: {availability}")
420
+ return availability
services/ocr_service.py ADDED
@@ -0,0 +1,288 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import logging
3
+ import asyncio
4
+ from pathlib import Path
5
+ import os
6
+ import base64 # For encoding files
7
+ from typing import Optional, List, Dict, Any
8
+ import json
9
+
10
+ from mistralai import Mistral
11
+ from mistralai.models import SDKError
12
+ # PIL (Pillow) for dummy image creation in main_example
13
+ from PIL import Image, ImageDraw, ImageFont
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
class OCRService:
    """OCR text extraction backed by Mistral AI's Document AI (``client.ocr.process``).

    Accepts images and PDFs, uploads them as base64 data URIs, and returns the
    extracted content as Markdown. Requires the MISTRAL_API_KEY environment
    variable at construction time.
    """

    # Single source of truth for recognized image extensions. Previously three
    # methods each carried their own (inconsistent) copy of this map, so e.g.
    # extract_tables_from_image silently degraded '.webp' to octet-stream.
    _IMAGE_MIME_TYPES = {
        '.jpeg': 'image/jpeg', '.jpg': 'image/jpeg', '.png': 'image/png',
        '.gif': 'image/gif', '.bmp': 'image/bmp', '.tiff': 'image/tiff',
        '.webp': 'image/webp', '.avif': 'image/avif',
    }

    def __init__(self):
        """Read MISTRAL_API_KEY from the environment and build the Mistral client.

        Raises:
            ValueError: if MISTRAL_API_KEY is not set.
        """
        self.api_key = os.environ.get("MISTRAL_API_KEY")
        if not self.api_key:
            logger.error("MISTRAL_API_KEY environment variable not set.")
            raise ValueError("MISTRAL_API_KEY not found in environment variables.")

        self.client = Mistral(api_key=self.api_key)
        self.ocr_model_name = "mistral-ocr-latest"
        # Kept for interface compatibility; Mistral OCR itself is multilingual.
        self.language = 'eng'
        logger.info(f"OCRService (using Mistral AI model {self.ocr_model_name}) initialized.")

    def _encode_file_to_base64(self, file_path: str) -> Optional[str]:
        """Return the file's contents as a base64 string, or None on any failure."""
        try:
            with open(file_path, "rb") as file_to_encode:
                return base64.b64encode(file_to_encode.read()).decode('utf-8')
        except FileNotFoundError:
            logger.error(f"Error: The file {file_path} was not found for Base64 encoding.")
            return None
        except Exception as e:
            logger.error(f"Error during Base64 encoding for {file_path}: {e}")
            return None

    def _guess_image_mime_type(self, image_path: str, context: str) -> str:
        """Map an image path's extension to a MIME type.

        Unknown extensions fall back to 'application/octet-stream' with a
        warning that names the calling *context*.
        """
        ext = Path(image_path).suffix.lower()
        mime_type = self._IMAGE_MIME_TYPES.get(ext)
        if not mime_type:
            logger.warning(f"Unsupported image extension '{ext}' in {context}. Defaulting to 'application/octet-stream'.")
            mime_type = 'application/octet-stream'
        return mime_type

    def _extract_page_content(self, page, index: int, file_name: str) -> Optional[str]:
        """Pull text out of one OCR response page, probing the known attribute names."""
        for attr in ('markdown', 'markdown_content', 'text'):
            content = getattr(page, attr, None)
            if content:
                logger.debug(f"Extracted content from page {index} using 'page.{attr}'.")
                return content
        details = str(vars(page))[:200] if hasattr(page, '__dict__') else str(page)[:200]
        logger.warning(f"Page {index} in OCR response for {file_name} has no 'markdown', 'markdown_content', or 'text'. Page details: {details}")
        return None

    async def _process_file_with_mistral(self, file_path: str, mime_type: str) -> str:
        """Send *file_path* to Mistral OCR and return extracted Markdown ('' on failure).

        Images are sent as an 'image_url' payload, everything else as
        'document_url', both embedded as base64 data URIs. The blocking SDK
        call runs in a worker thread so the event loop is not blocked.
        """
        file_name = Path(file_path).name
        logger.info(f"Preparing to process file: {file_name} (MIME: {mime_type}) with Mistral OCR.")

        base64_encoded_file = self._encode_file_to_base64(file_path)
        if not base64_encoded_file:
            logger.warning(f"Base64 encoding failed for {file_name}, cannot process.")
            return ""

        payload_type = "image_url" if mime_type.startswith("image/") else "document_url"
        document_payload = {
            "type": payload_type,
            payload_type: f"data:{mime_type};base64,{base64_encoded_file}",
        }
        try:
            logger.info(f"Calling Mistral client.ocr.process for {file_name} with model {self.ocr_model_name}.")
            # asyncio.to_thread replaces the deprecated
            # get_event_loop().run_in_executor(None, ...) pattern and matches
            # how validate_ocr_setup already dispatches blocking SDK calls.
            ocr_response = await asyncio.to_thread(
                self.client.ocr.process,
                model=self.ocr_model_name,
                document=document_payload,
                include_image_base64=False,
            )

            logger.info(f"Received OCR response for {file_name}. Type: {type(ocr_response)}")

            extracted_markdown = ""
            pages = getattr(ocr_response, 'pages', None)
            if pages and isinstance(pages, list):
                all_pages_markdown = []
                for i, page in enumerate(pages):
                    page_content = self._extract_page_content(page, i, file_name)
                    if page_content:
                        all_pages_markdown.append(page_content)
                if all_pages_markdown:
                    # Keep an explicit marker between per-page Markdown chunks.
                    extracted_markdown = "\n\n---\nPage Break (simulated)\n---\n\n".join(all_pages_markdown)
                else:
                    logger.warning(f"'pages' attribute found but no content extracted from any pages for {file_name}.")
            # Fallbacks if the response has no 'pages' but exposes text/markdown directly.
            elif getattr(ocr_response, 'text', None):
                extracted_markdown = ocr_response.text
                logger.info(f"Extracted content from 'ocr_response.text' (no pages structure) for {file_name}.")
            elif getattr(ocr_response, 'markdown', None):
                extracted_markdown = ocr_response.markdown
                logger.info(f"Extracted content from 'ocr_response.markdown' (no pages structure) for {file_name}.")
            elif isinstance(ocr_response, str) and ocr_response:
                extracted_markdown = ocr_response
                logger.info(f"OCR response is a direct non-empty string for {file_name}.")
            else:
                logger.warning(f"Could not extract markdown from OCR response for {file_name} using known attributes (pages, text, markdown).")

            if not extracted_markdown.strip():
                logger.warning(f"Extracted markdown is empty for {file_name} after all parsing attempts.")

            return extracted_markdown.strip()

        except SDKError as e:
            logger.error(f"Mistral API Exception during client.ocr.process for {file_name}: {e.message}")
            logger.exception("SDKError details:")
            return ""
        except Exception as e:
            logger.error(f"Generic Exception during Mistral client.ocr.process call for {file_name}: {e}")
            logger.exception("Exception details:")
            return ""

    async def extract_text_from_image(self, image_path: str, language: Optional[str] = None) -> str:
        """Extract Markdown text from an image.

        *language* is accepted for interface compatibility but not forwarded:
        Mistral OCR is broadly multilingual.
        """
        if language:
            logger.info(f"Language parameter '{language}' provided, but Mistral OCR is broadly multilingual.")
        mime_type = self._guess_image_mime_type(image_path, "extract_text_from_image")
        return await self._process_file_with_mistral(image_path, mime_type)

    async def extract_text_from_pdf(self, pdf_path: str) -> str:
        """Extract Markdown text from a PDF (Mistral handles PDFs natively)."""
        return await self._process_file_with_mistral(pdf_path, "application/pdf")

    async def extract_text_from_pdf_images(self, pdf_path: str) -> List[str]:
        """Compatibility shim: return the whole PDF's Markdown as a one-element list."""
        logger.info("Mistral processes PDFs directly. This method will return the full Markdown content as a single list item.")
        full_markdown = await self._process_file_with_mistral(pdf_path, "application/pdf")
        return [full_markdown] if full_markdown else [""]

    async def extract_text_with_confidence(self, image_path: str, min_confidence: float = 0.5) -> Dict[str, Any]:
        """Extract text with a placeholder confidence field.

        Mistral's ocr.process returns structured Markdown, not word-level
        confidence scores, so 'confidence' is always 0.0.
        """
        logger.warning("Mistral Document AI API (ocr.process) typically returns structured text (Markdown). Word-level confidence scores are not standard. 'confidence' field is a placeholder.")
        mime_type = self._guess_image_mime_type(image_path, "extract_text_with_confidence")
        text_markdown = await self._process_file_with_mistral(image_path, mime_type)
        return {
            "text": text_markdown,
            "confidence": 0.0,  # placeholder: not provided by the API
            "word_count": len(text_markdown.split()) if text_markdown else 0,
            "raw_data": "Mistral ocr.process response contains structured data. See logs from _process_file_with_mistral for details."
        }

    async def detect_language(self, image_path: str) -> str:
        """Placeholder: ocr.process exposes no language detection; always 'eng'."""
        logger.warning("Mistral OCR is multilingual; explicit language detection is not part of client.ocr.process.")
        return 'eng'

    async def extract_tables_from_image(self, image_path: str) -> List[List[str]]:
        """Extract table rows from an image via a basic Markdown-table parse.

        Returns one list of cell strings per '|'-delimited Markdown row;
        alignment rows containing '---' are skipped.
        """
        logger.info("Extracting text (Markdown) from image using Mistral. Mistral OCR preserves table structures in Markdown.")
        mime_type = self._guess_image_mime_type(image_path, "extract_tables_from_image")
        markdown_content = await self._process_file_with_mistral(image_path, mime_type)
        if not markdown_content:
            return []

        logger.info("Attempting basic parsing of Markdown tables. For complex tables, a dedicated parser is recommended.")
        table_data = []
        for line in markdown_content.split('\n'):
            stripped_line = line.strip()
            # A Markdown table row starts and ends with '|'; '---' rows are separators.
            if stripped_line.startswith('|') and stripped_line.endswith('|') and "---" not in stripped_line:
                cells = [cell.strip() for cell in stripped_line.strip('|').split('|')]
                if any(cells):
                    table_data.append(cells)

        if table_data:
            logger.info(f"Extracted {len(table_data)} lines potentially forming tables using basic parsing.")
        else:
            logger.info("No distinct table structures found with basic parsing from extracted markdown.")
        return table_data

    async def get_supported_languages(self) -> List[str]:
        """Nominal language list; Mistral OCR is broadly multilingual."""
        logger.info("Mistral OCR is multilingual. Refer to official Mistral AI documentation for details.")
        return ['eng', 'multilingual (refer to Mistral documentation)']

    async def validate_ocr_setup(self) -> Dict[str, Any]:
        """Sanity-check the client by listing models; returns a status dict."""
        try:
            models_response = await asyncio.to_thread(self.client.models.list)
            model_ids = [model.id for model in models_response.data]
            return {
                "status": "operational",
                "message": "Mistral client initialized. API key present. Model listing successful.",
                "mistral_available_models_sample": model_ids[:5],
                "configured_ocr_model": self.ocr_model_name,
            }
        except SDKError as e:
            logger.error(f"Mistral API Exception during setup validation: {e.message}")
            return {"status": "error", "error": f"Mistral API Error: {e.message}"}
        except Exception as e:
            logger.error(f"Generic error during Mistral OCR setup validation: {str(e)}")
            return {"status": "error", "error": str(e)}

    def extract_text(self, file_path: str) -> str:
        """Synchronous convenience wrapper; dispatches on file extension.

        NOTE: uses asyncio.run, so it must NOT be called from inside a running
        event loop.
        """
        logger.warning("`extract_text` is a synchronous method. Running async Mistral OCR in a blocking way.")
        try:
            ext = Path(file_path).suffix.lower()
            if ext in self._IMAGE_MIME_TYPES:
                return asyncio.run(self.extract_text_from_image(file_path))
            if ext == '.pdf':
                return asyncio.run(self.extract_text_from_pdf(file_path))
            logger.error(f"Unsupported file type for sync extract_text: {file_path}")
            return "Unsupported file type."
        except Exception as e:
            logger.error(f"Error in synchronous extract_text for {file_path}: {str(e)}")
            return "Error during sync extraction."
244
+
245
# Example of how to use the OCRService (main execution part)
async def main_example():
    """Demo driver: validate the service, then OCR a sample PDF and (optionally) an image."""
    logging.basicConfig(
        level=logging.DEBUG,
        format='%(asctime)s - %(levelname)s - %(name)s - %(funcName)s - %(message)s',
    )

    # The service cannot be constructed without an API key, so check up front.
    if not os.environ.get("MISTRAL_API_KEY"):
        logger.error("MISTRAL_API_KEY environment variable is not set. Please set it: export MISTRAL_API_KEY='yourkey'")
        return

    service = OCRService()

    logger.info("--- Validating OCR Service Setup ---")
    setup_status = await service.validate_ocr_setup()
    logger.info(f"OCR Service Validation: {setup_status}")
    if setup_status.get("status") == "error":
        logger.error("Halting due to validation error.")
        return

    # --- Test with a specific PDF file ---
    pdf_path_to_test = r"C:\path\to\your\certificate.pdf"

    if not os.path.exists(pdf_path_to_test):
        logger.warning(f"PDF file for specific test '{pdf_path_to_test}' not found. Skipping this test.")
        logger.warning("Please update `pdf_path_to_test` in `main_example` to a valid PDF path.")
    else:
        logger.info(f"\n--- Extracting text from specific PDF: {pdf_path_to_test} ---")
        # Using the method that aligns with original `extract_text_from_pdf_images` signature
        markdown_pages = await service.extract_text_from_pdf_images(pdf_path_to_test)
        if markdown_pages and markdown_pages[0]:
            logger.info(f"Extracted Markdown from PDF ({pdf_path_to_test}):\n" + markdown_pages[0])
        else:
            logger.warning(f"No text extracted from PDF {pdf_path_to_test} or an error occurred.")

    image_path = "dummy_test_image_ocr.png"
    if not os.path.exists(image_path):
        logger.info(f"Dummy image {image_path} not created or found, skipping optional image test.")
    else:
        logger.info(f"\n---Extracting text from image: {image_path} ---")
        # ... image processing logic ...
        pass


if __name__ == '__main__':
    asyncio.run(main_example())
services/podcast_generator_service.py ADDED
@@ -0,0 +1,710 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from typing import List, Dict, Any, Optional
3
+ from dataclasses import dataclass, asdict
4
+ from datetime import datetime
5
+ from pathlib import Path
6
+ import re
7
+ import uuid
8
+
9
+ try:
10
+ from elevenlabs import VoiceSettings
11
+ from elevenlabs.client import ElevenLabs
12
+ ELEVENLABS_AVAILABLE = True
13
+ except ImportError:
14
+ ELEVENLABS_AVAILABLE = False
15
+
16
+ import config
17
+ from services.llamaindex_service import LlamaIndexService
18
+ from services.llm_service import LLMService
19
+ from services.document_store_service import DocumentStoreService
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
@dataclass
class DocumentAnalysis:
    """Analysis results from document(s)"""
    key_insights: List[str]      # most important takeaways (analyze_documents caps this at 7)
    topics: List[str]            # main themes found in the analyzed content
    complexity_level: str        # "beginner" / "intermediate" / "advanced" (from the analysis prompt)
    estimated_words: int         # word count of the combined (possibly truncated) source text
    source_documents: List[str]  # filenames of the documents that were analyzed
    summary: str                 # prose summary suitable for podcast discussion
32
+
33
@dataclass
class DialogueLine:
    """Single line of podcast dialogue"""
    speaker: str
    text: str
    pause_after: float = 0.5  # seconds of silence after this line (default half a second)

@dataclass
class PodcastScript:
    """Complete podcast script"""
    dialogue: List[DialogueLine]
    total_duration_estimate: float
    word_count: int
    style: str

    def to_text(self) -> str:
        """Render the script as plain text: one "SPEAKER: text" paragraph per line, blank-line separated."""
        return "\n\n".join(f"{entry.speaker}: {entry.text}" for entry in self.dialogue)
53
+
54
@dataclass
class PodcastMetadata:
    """Metadata for generated podcast"""
    podcast_id: str                    # UUID assigned at generation time (see generate_podcast)
    title: str
    description: str
    source_documents: List[str]        # the documents the podcast was built from
    style: str                         # one of the SCRIPT_PROMPTS keys, e.g. "conversational"
    duration_seconds: float
    file_size_mb: float
    voices: Dict[str, str]             # host role -> voice name used for synthesis
    generated_at: str                  # generation timestamp as a string
    generation_cost: Dict[str, float]  # per-service cost breakdown — TODO(review): confirm units
    key_topics: List[str]
68
+
69
@dataclass
class PodcastResult:
    """Complete podcast generation result"""
    podcast_id: str              # matches PodcastMetadata.podcast_id
    audio_file_path: str         # empty string when generation failed
    transcript: str              # plain-text script ("" on failure)
    metadata: PodcastMetadata    # NOTE: generate_podcast passes None on failure despite this annotation
    generation_time: float       # wall-clock seconds spent generating
    success: bool
    error: Optional[str] = None  # error message, set only when success is False
79
+
80
+
81
+ class PodcastGeneratorService:
82
+ """
83
+ Service for generating conversational podcasts from documents.
84
+ """
85
+
86
    # Speaking-rate constant; presumably used to convert the requested duration
    # in minutes into the {word_count} budget fed to the prompts below —
    # TODO(review): confirm against generate_script (not visible in this excerpt).
    WORDS_PER_MINUTE = 150

    # Prompt templates keyed by podcast style. Every template exposes the
    # placeholders {document_content}, {key_insights}, {duration_minutes} and
    # {word_count}, and instructs the model to emit alternating
    # "HOST1:" / "HOST2:" dialogue lines.
    SCRIPT_PROMPTS = {
        "conversational": """You are an expert podcast script writer. Create an engaging 2-host podcast discussing the provided documents.

DOCUMENT CONTENT:
{document_content}

KEY INSIGHTS:
{key_insights}

REQUIREMENTS:
- Duration: {duration_minutes} minutes (approximately {word_count} words)
- Style: Conversational, friendly, and accessible
- Format: Alternating dialogue between HOST1 and HOST2
- Make the content engaging and easy to understand
- Include natural transitions and enthusiasm

DIALOGUE FORMAT (strictly follow):
HOST1: [What they say]
HOST2: [What they say]

STRUCTURE:
1. Opening Hook (30 seconds): Grab attention
2. Introduction (1 minute): Set context
3. Main Discussion (70% of time): Deep dive into insights
4. Wrap-up (1 minute): Summarize key takeaways

Generate the complete podcast script now:""",

        "educational": """Create an educational podcast discussing the provided documents.

DOCUMENT CONTENT:
{document_content}

KEY INSIGHTS:
{key_insights}

REQUIREMENTS:
- Duration: {duration_minutes} minutes (approximately {word_count} words)
- Style: Clear, methodical, educational
- HOST1 acts as teacher, HOST2 as curious learner

DIALOGUE FORMAT:
HOST1: [Expert explanation]
HOST2: [Clarifying question]

Generate the educational podcast script now:""",

        "technical": """Create a technical podcast for an informed audience.

DOCUMENT CONTENT:
{document_content}

KEY INSIGHTS:
{key_insights}

REQUIREMENTS:
- Duration: {duration_minutes} minutes (approximately {word_count} words)
- Style: Professional, detailed, technically accurate
- HOST1 is expert, HOST2 is informed interviewer

DIALOGUE FORMAT:
HOST1: [Technical insight]
HOST2: [Probing question]

Generate the technical podcast script now:""",

        "casual": """Create a fun, casual podcast discussing the documents.

DOCUMENT CONTENT:
{document_content}

KEY INSIGHTS:
{key_insights}

REQUIREMENTS:
- Duration: {duration_minutes} minutes (approximately {word_count} words)
- Style: Relaxed, humorous, energetic
- Make it entertaining while informative

DIALOGUE FORMAT:
HOST1: [Casual commentary]
HOST2: [Enthusiastic response]

Generate the casual podcast script now:"""
    }
173
+
174
+ def __init__(
175
+ self,
176
+ llamaindex_service: LlamaIndexService,
177
+ llm_service: LLMService,
178
+ elevenlabs_api_key: Optional[str] = None
179
+ ):
180
+ self.config = config.config
181
+ self.llamaindex_service = llamaindex_service
182
+ self.llm_service = llm_service
183
+
184
+ # Get document store from llamaindex service
185
+ self.document_store = llamaindex_service.document_store
186
+
187
+ # Initialize ElevenLabs client
188
+ self.elevenlabs_client = None
189
+ if ELEVENLABS_AVAILABLE:
190
+ api_key = elevenlabs_api_key or self.config.ELEVENLABS_API_KEY
191
+ if api_key:
192
+ try:
193
+ self.elevenlabs_client = ElevenLabs(api_key=api_key)
194
+ logger.info("ElevenLabs client initialized for podcast generation")
195
+ except Exception as e:
196
+ logger.error(f"Failed to initialize ElevenLabs client: {e}")
197
+
198
+ # Create podcast storage directory
199
+ self.podcast_dir = Path("./data/podcasts")
200
+ self.podcast_dir.mkdir(parents=True, exist_ok=True)
201
+
202
+ # Metadata database file
203
+ self.metadata_file = self.podcast_dir / "metadata_db.json"
204
+ self._ensure_metadata_db()
205
+
206
+ # Voice cache
207
+ self._voice_cache = {}
208
+
209
+ def _ensure_metadata_db(self):
210
+ """Ensure metadata database exists"""
211
+ if not self.metadata_file.exists():
212
+ import json
213
+ self.metadata_file.write_text(json.dumps([], indent=2))
214
+
215
+ async def generate_podcast(
216
+ self,
217
+ document_ids: List[str],
218
+ style: str = "conversational",
219
+ duration_minutes: int = 10,
220
+ host1_voice: str = "Rachel",
221
+ host2_voice: str = "Adam"
222
+ ) -> PodcastResult:
223
+ """Generate a complete podcast from documents"""
224
+ start_time = datetime.now()
225
+ podcast_id = str(uuid.uuid4())
226
+
227
+ try:
228
+ logger.info(f"Starting podcast generation {podcast_id}")
229
+ logger.info(f"Documents: {document_ids}, Style: {style}, Duration: {duration_minutes}min")
230
+
231
+ # Step 1: Retrieve and analyze documents
232
+ logger.info("Step 1: Retrieving and analyzing documents...")
233
+ analysis = await self.analyze_documents(document_ids)
234
+
235
+ # Step 2: Generate script
236
+ logger.info("Step 2: Generating podcast script...")
237
+ script = await self.generate_script(analysis, style, duration_minutes)
238
+
239
+ # Step 3: Synthesize audio
240
+ logger.info("Step 3: Synthesizing audio with voices...")
241
+ audio_file_path = await self.synthesize_audio(
242
+ podcast_id,
243
+ script,
244
+ host1_voice,
245
+ host2_voice
246
+ )
247
+
248
+ # Calculate generation time
249
+ generation_time = (datetime.now() - start_time).total_seconds()
250
+
251
+ # Step 4: Create metadata
252
+ logger.info("Step 4: Creating metadata...")
253
+ metadata = self._create_metadata(
254
+ podcast_id,
255
+ analysis,
256
+ script,
257
+ audio_file_path,
258
+ {host1_voice, host2_voice},
259
+ document_ids,
260
+ style
261
+ )
262
+
263
+ # Save metadata
264
+ self._save_metadata(metadata)
265
+
266
+ # Save transcript
267
+ transcript_path = self.podcast_dir / f"{podcast_id}_transcript.txt"
268
+ transcript_path.write_text(script.to_text(), encoding="utf-8")
269
+
270
+ logger.info(f"Podcast generated successfully: {podcast_id}")
271
+
272
+ return PodcastResult(
273
+ podcast_id=podcast_id,
274
+ audio_file_path=str(audio_file_path),
275
+ transcript=script.to_text(),
276
+ metadata=metadata,
277
+ generation_time=generation_time,
278
+ success=True
279
+ )
280
+
281
+ except Exception as e:
282
+ logger.error(f"Podcast generation failed: {str(e)}", exc_info=True)
283
+ return PodcastResult(
284
+ podcast_id=podcast_id,
285
+ audio_file_path="",
286
+ transcript="",
287
+ metadata=None,
288
+ generation_time=(datetime.now() - start_time).total_seconds(),
289
+ success=False,
290
+ error=str(e)
291
+ )
292
+
293
    async def analyze_documents(self, document_ids: List[str]) -> DocumentAnalysis:
        """
        Retrieve documents and extract key insights for podcast

        FIXED: Now actually retrieves document content from document store

        Fetches each document by id, concatenates the contents (truncated to
        ~15k chars), asks the LLM for a structured analysis, and parses it into
        a DocumentAnalysis via the _extract_* / _determine_complexity helpers.

        Raises:
            RuntimeError: wrapping any failure, including the case where none
                of the requested document ids exist in the store.
        """
        try:
            # Step 1: Retrieve actual documents from document store
            logger.info(f"Retrieving {len(document_ids)} documents from store...")
            documents = []
            document_contents = []

            for doc_id in document_ids:
                doc = await self.document_store.get_document(doc_id)
                if doc:
                    documents.append(doc)
                    document_contents.append(doc.content)
                    logger.info(f"Retrieved document: {doc.filename} ({len(doc.content)} chars)")
                else:
                    # Missing ids are skipped (warned), not fatal, unless ALL are missing.
                    logger.warning(f"Document {doc_id} not found in store")

            if not documents:
                raise ValueError(f"No documents found for IDs: {document_ids}")

            # Step 2: Combine document content
            combined_content = "\n\n---DOCUMENT SEPARATOR---\n\n".join(document_contents)

            # Truncate if too long (keep first portion for context)
            max_content_length = 15000  # Adjust based on your LLM context window
            if len(combined_content) > max_content_length:
                logger.warning(f"Content too long ({len(combined_content)} chars), truncating to {max_content_length}")
                combined_content = combined_content[:max_content_length] + "\n\n[Content truncated...]"

            # Step 3: Use LLM to analyze the content. The prompt requests the
            # exact section headers the _extract_* parsers below look for.
            analysis_prompt = f"""Analyze the following document(s) and provide:

1. The 5-7 most important insights or key points (be specific and detailed)
2. Main themes and topics covered
3. The overall complexity level (beginner/intermediate/advanced)
4. A comprehensive summary suitable for podcast discussion

DOCUMENTS:
{combined_content}

Provide a structured analysis optimized for creating an engaging podcast discussion.
Format your response as:

KEY INSIGHTS:
1. [First key insight]
2. [Second key insight]
...

TOPICS:
- [Topic 1]
- [Topic 2]
...

COMPLEXITY: [beginner/intermediate/advanced]

SUMMARY:
[Your comprehensive summary here]
"""

            logger.info("Analyzing content with LLM...")
            result = await self.llm_service.generate_text(
                analysis_prompt,
                max_tokens=2000,
                temperature=0.7
            )

            # Step 4: Parse the structured response
            insights = self._extract_insights(result)
            topics = self._extract_topics(result)
            complexity = self._determine_complexity(result)
            summary = self._extract_summary(result)

            logger.info(f"Analysis complete: {len(insights)} insights, {len(topics)} topics")

            return DocumentAnalysis(
                key_insights=insights[:7],  # cap at 7, matching the prompt's request
                topics=topics,
                complexity_level=complexity,
                estimated_words=len(combined_content.split()),
                source_documents=[doc.filename for doc in documents],
                summary=summary or result[:500]  # fall back to raw analysis head if no SUMMARY section
            )

        except Exception as e:
            logger.error(f"Document analysis failed: {str(e)}", exc_info=True)
            raise RuntimeError(f"Failed to analyze documents: {str(e)}")
383
+
384
+ def _extract_summary(self, text: str) -> str:
385
+ """Extract summary section from analysis"""
386
+ try:
387
+ if "SUMMARY:" in text:
388
+ parts = text.split("SUMMARY:")
389
+ if len(parts) > 1:
390
+ summary = parts[1].strip()
391
+ # Take first 500 chars if too long
392
+ return summary[:500] if len(summary) > 500 else summary
393
+ except:
394
+ pass
395
+
396
+ # Fallback: take first few sentences
397
+ sentences = text.split('.')
398
+ return '. '.join(sentences[:3]) + '.'
399
+
400
+ def _extract_insights(self, text: str) -> List[str]:
401
+ """Extract key insights from analysis text"""
402
+ insights = []
403
+ lines = text.split('\n')
404
+
405
+ in_insights_section = False
406
+ for line in lines:
407
+ line = line.strip()
408
+
409
+ if "KEY INSIGHTS:" in line.upper():
410
+ in_insights_section = True
411
+ continue
412
+ elif line.upper().startswith(("TOPICS:", "COMPLEXITY:", "SUMMARY:")):
413
+ in_insights_section = False
414
+
415
+ if in_insights_section and line:
416
+ # Match patterns like "1.", "2.", "-", "*", "•"
417
+ insight = re.sub(r'^\d+\.|\-|\*|•', '', line).strip()
418
+ if len(insight) > 20:
419
+ insights.append(insight)
420
+
421
+ # Fallback if no insights found
422
+ if not insights:
423
+ sentences = text.split('.')
424
+ insights = [s.strip() + '.' for s in sentences[:7] if len(s.strip()) > 20]
425
+
426
+ return insights
427
+
428
+ def _extract_topics(self, text: str) -> List[str]:
429
+ """Extract main topics from analysis"""
430
+ topics = []
431
+ lines = text.split('\n')
432
+
433
+ in_topics_section = False
434
+ for line in lines:
435
+ line = line.strip()
436
+
437
+ if "TOPICS:" in line.upper():
438
+ in_topics_section = True
439
+ continue
440
+ elif line.upper().startswith(("KEY INSIGHTS:", "COMPLEXITY:", "SUMMARY:")):
441
+ in_topics_section = False
442
+
443
+ if in_topics_section and line:
444
+ topic = re.sub(r'^\-|\*|•', '', line).strip()
445
+ if len(topic) > 2:
446
+ topics.append(topic)
447
+
448
+ # Fallback: simple keyword extraction
449
+ if not topics:
450
+ common_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by'}
451
+ words = text.lower().split()
452
+ word_freq = {}
453
+
454
+ for word in words:
455
+ word = re.sub(r'[^\w\s]', '', word)
456
+ if len(word) > 4 and word not in common_words:
457
+ word_freq[word] = word_freq.get(word, 0) + 1
458
+
459
+ top_topics = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)[:5]
460
+ topics = [topic[0].title() for topic in top_topics]
461
+
462
+ return topics[:5]
463
+
464
+ def _determine_complexity(self, text: str) -> str:
465
+ """Determine content complexity level"""
466
+ text_lower = text.lower()
467
+
468
+ if "complexity:" in text_lower:
469
+ for level in ["beginner", "intermediate", "advanced"]:
470
+ if level in text_lower.split("complexity:")[1][:100]:
471
+ return level
472
+
473
+ # Heuristic based on keywords
474
+ if any(word in text_lower for word in ['basic', 'introduction', 'beginner', 'simple']):
475
+ return "beginner"
476
+ elif any(word in text_lower for word in ['advanced', 'complex', 'sophisticated', 'expert']):
477
+ return "advanced"
478
+ else:
479
+ return "intermediate"
480
+
481
    async def generate_script(
        self,
        analysis: DocumentAnalysis,
        style: str,
        duration_minutes: int
    ) -> PodcastScript:
        """Generate a two-host podcast script from a document analysis.

        Args:
            analysis: Prior analysis providing `summary` and `key_insights`.
            style: Key into SCRIPT_PROMPTS; unknown styles fall back to
                "conversational".
            duration_minutes: Target length; converted to a word budget via
                WORDS_PER_MINUTE.

        Returns:
            PodcastScript with parsed HOST1/HOST2 dialogue, word count and
            an estimated duration in seconds.

        Raises:
            ValueError: If the LLM output yields no parseable dialogue lines.
        """
        target_words = duration_minutes * self.WORDS_PER_MINUTE

        # Number the insights so the prompt template can reference them
        insights_text = "\n".join(f"{i+1}. {insight}" for i, insight in enumerate(analysis.key_insights))

        # Get prompt template (fall back to conversational for unknown styles)
        prompt_template = self.SCRIPT_PROMPTS.get(style, self.SCRIPT_PROMPTS["conversational"])

        # Fill template
        prompt = prompt_template.format(
            document_content=analysis.summary,
            key_insights=insights_text,
            duration_minutes=duration_minutes,
            word_count=target_words
        )

        # Generate script. max_tokens = 2x the word budget as headroom since
        # tokens-per-word > 1 — NOTE(review): confirm the 2x factor suffices
        # for the configured LLM's tokenizer.
        script_text = await self.llm_service.generate_text(
            prompt,
            max_tokens=target_words * 2,
            temperature=0.8
        )

        # Parse into dialogue (only HOST1:/HOST2: prefixed lines are kept)
        dialogue = self._parse_script(script_text)

        if not dialogue:
            raise ValueError("Failed to parse script into dialogue lines")

        word_count = sum(len(line.text.split()) for line in dialogue)
        duration_estimate = word_count / self.WORDS_PER_MINUTE

        return PodcastScript(
            dialogue=dialogue,
            total_duration_estimate=duration_estimate * 60,  # minutes -> seconds
            word_count=word_count,
            style=style
        )
526
+
527
+ def _parse_script(self, script_text: str) -> List[DialogueLine]:
528
+ """Parse generated script into dialogue lines"""
529
+ dialogue = []
530
+ lines = script_text.split('\n')
531
+
532
+ for line in lines:
533
+ line = line.strip()
534
+ if not line:
535
+ continue
536
+
537
+ if line.startswith('HOST1:'):
538
+ text = line[6:].strip()
539
+ if text:
540
+ dialogue.append(DialogueLine(speaker="HOST1", text=text))
541
+ elif line.startswith('HOST2:'):
542
+ text = line[6:].strip()
543
+ if text:
544
+ dialogue.append(DialogueLine(speaker="HOST2", text=text))
545
+
546
+ return dialogue
547
+
548
    def _get_voice_id(self, voice_name: str) -> str:
        """Resolve a human-readable voice name to an ElevenLabs voice ID.

        The full voice list is fetched once from the ElevenLabs API and
        cached in self._voice_cache (lowercased name -> voice_id).
        Resolution order: exact case-insensitive match, then substring
        match, then the first cached voice as a last-resort default.

        Args:
            voice_name: Display name such as "Rachel" or "Adam".

        Returns:
            The matching (or fallback) ElevenLabs voice ID.

        Raises:
            RuntimeError: If the voice list cannot be fetched or is empty.
        """
        try:
            # Populate the cache on first use (one API round-trip)
            if not self._voice_cache:
                voices = self.elevenlabs_client.voices.get_all()
                if not voices or not voices.voices:
                    raise RuntimeError("No voices available")

                for voice in voices.voices:
                    self._voice_cache[voice.name.lower()] = voice.voice_id

            # Exact match
            if voice_name.lower() in self._voice_cache:
                return self._voice_cache[voice_name.lower()]

            # Partial match: requested name contained in a cached name
            for name, voice_id in self._voice_cache.items():
                if voice_name.lower() in name:
                    logger.info(f"Partial match for '{voice_name}': {name}")
                    return voice_id

            # Fallback: arbitrary first cached voice (dict insertion order)
            first_voice_id = list(self._voice_cache.values())[0]
            logger.warning(f"Voice '{voice_name}' not found, using default")
            return first_voice_id

        except Exception as e:
            logger.error(f"Could not fetch voices: {e}")
            raise RuntimeError(f"Failed to get voice ID: {str(e)}")
578
+
579
    async def synthesize_audio(
        self,
        podcast_id: str,
        script: PodcastScript,
        host1_voice: str,
        host2_voice: str
    ) -> Path:
        """Synthesize the full podcast MP3, alternating host voices.

        Each dialogue line is synthesized separately with the voice mapped
        to its speaker tag, and the resulting MP3 byte streams are
        concatenated into <podcast_dir>/<podcast_id>.mp3.

        Args:
            podcast_id: Output file stem.
            script: Script whose dialogue lines carry HOST1/HOST2 speakers.
            host1_voice: ElevenLabs voice name for HOST1.
            host2_voice: ElevenLabs voice name for HOST2.

        Returns:
            Path to the written MP3 file.

        Raises:
            RuntimeError: If the client is missing, no audio is produced,
                or the output file is <= 1000 bytes (treated as failure).
        """
        if not self.elevenlabs_client:
            raise RuntimeError("ElevenLabs client not initialized")

        audio_file = self.podcast_dir / f"{podcast_id}.mp3"

        try:
            # Resolve voice names to ElevenLabs voice IDs
            host1_voice_id = self._get_voice_id(host1_voice)
            host2_voice_id = self._get_voice_id(host2_voice)

            logger.info(f"HOST1: {host1_voice}, HOST2: {host2_voice}")

            voice_map = {
                "HOST1": host1_voice_id,
                "HOST2": host2_voice_id
            }

            audio_chunks = []

            # Process each line with the correct voice; unknown speaker
            # tags fall back to HOST1's voice.
            for i, line in enumerate(script.dialogue):
                logger.info(f"Line {i+1}/{len(script.dialogue)}: {line.speaker}")

                voice_id = voice_map.get(line.speaker, host1_voice_id)

                audio_generator = self.elevenlabs_client.text_to_speech.convert(
                    voice_id=voice_id,
                    text=line.text,
                    model_id="eleven_multilingual_v2"
                )

                # convert() streams the MP3 in chunks; gather them per line
                line_chunks = []
                for chunk in audio_generator:
                    if chunk:
                        line_chunks.append(chunk)

                if line_chunks:
                    audio_chunks.append(b''.join(line_chunks))

            if not audio_chunks:
                raise RuntimeError("No audio chunks generated")

            # NOTE(review): naive byte-concatenation of separate MP3 streams;
            # most players tolerate it — confirm gapless playback is acceptable.
            full_audio = b''.join(audio_chunks)

            with open(audio_file, 'wb') as f:
                f.write(full_audio)

            # Sanity check: treat a tiny output file as a failed synthesis
            if audio_file.exists() and audio_file.stat().st_size > 1000:
                logger.info(f"Audio created: {audio_file} ({audio_file.stat().st_size} bytes)")
                return audio_file
            else:
                raise RuntimeError("Audio file too small or empty")

        except Exception as e:
            logger.error(f"Audio synthesis failed: {e}", exc_info=True)
            raise RuntimeError(f"Failed to generate audio: {str(e)}")
643
+
644
    def _create_metadata(
        self,
        podcast_id: str,
        analysis: DocumentAnalysis,
        script: PodcastScript,
        audio_path: Path,
        voices: set,
        document_ids: List[str],
        style: str
    ) -> PodcastMetadata:
        """Assemble PodcastMetadata for a finished generation run.

        Cost figures are rough heuristics, not billed amounts:
        $0.01 / 1k LLM words and $0.30 / 1k TTS characters assuming
        ~5 characters per word.
        """
        title = f"Podcast: {analysis.topics[0] if analysis.topics else 'Document Discussion'}"
        description = f"A {style} podcast discussing: {', '.join(analysis.source_documents)}"
        file_size_mb = audio_path.stat().st_size / (1024 * 1024) if audio_path.exists() else 0

        # Heuristic cost estimates (see docstring)
        llm_cost = (script.word_count / 1000) * 0.01
        tts_cost = (script.word_count * 5 / 1000) * 0.30

        return PodcastMetadata(
            podcast_id=podcast_id,
            title=title,
            description=description,
            source_documents=analysis.source_documents,
            style=style,
            duration_seconds=script.total_duration_estimate,
            file_size_mb=file_size_mb,
            # NOTE(review): `voices` is a set, so list(voices) ordering is
            # arbitrary — the host1/host2 labels may swap between runs.
            # Confirm callers pass an ordered collection if this mapping matters.
            voices={"host1": list(voices)[0] if len(voices) > 0 else "Rachel",
                    "host2": list(voices)[1] if len(voices) > 1 else "Adam"},
            generated_at=datetime.now().isoformat(),
            generation_cost={"llm_cost": llm_cost, "tts_cost": tts_cost, "total": llm_cost + tts_cost},
            key_topics=analysis.topics
        )
676
+
677
    def _save_metadata(self, metadata: PodcastMetadata):
        """Append podcast metadata to the JSON metadata file.

        The file holds a JSON array of metadata dicts in generation order.
        Failures are logged and swallowed so a metadata write error never
        fails an otherwise successful generation.
        """
        try:
            import json
            existing = json.loads(self.metadata_file.read_text())
            existing.append(asdict(metadata))
            self.metadata_file.write_text(json.dumps(existing, indent=2))
            logger.info(f"Metadata saved: {metadata.podcast_id}")
        except Exception as e:
            logger.error(f"Failed to save metadata: {e}")
687
+
688
    def list_podcasts(self, limit: int = 10) -> List[PodcastMetadata]:
        """Return the most recently generated podcasts, newest first.

        Args:
            limit: Maximum number of entries to return.

        Returns:
            Up to `limit` PodcastMetadata objects; empty list on any error
            (including a missing/corrupt metadata file).
        """
        try:
            import json
            data = json.loads(self.metadata_file.read_text())
            # The file is append-ordered, so the tail holds the newest entries
            podcasts = [PodcastMetadata(**item) for item in data[-limit:]]
            return list(reversed(podcasts))
        except Exception as e:
            logger.error(f"Failed to list podcasts: {e}")
            return []
698
+
699
    def get_podcast(self, podcast_id: str) -> Optional[PodcastMetadata]:
        """Look up a single podcast's metadata by its ID.

        Args:
            podcast_id: The ID assigned at generation time.

        Returns:
            The matching PodcastMetadata, or None if not found or on
            any read/parse error.
        """
        try:
            import json
            data = json.loads(self.metadata_file.read_text())
            # Linear scan — the metadata file is expected to stay small
            for item in data:
                if item.get('podcast_id') == podcast_id:
                    return PodcastMetadata(**item)
            return None
        except Exception as e:
            logger.error(f"Failed to get podcast: {e}")
            return None
services/vector_store_service.py ADDED
@@ -0,0 +1,294 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import os
3
+ import pickle
4
+ import numpy as np
5
+ from typing import List, Dict, Any, Optional, Tuple
6
+ import faiss
7
+ from pathlib import Path
8
+ import asyncio
9
+ import json
10
+
11
+ from core.models import SearchResult, Chunk
12
+ import config
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
class VectorStoreService:
    """FAISS-backed vector store with JSON metadata persistence.

    Vectors live in a flat inner-product index (equivalent to cosine
    similarity when embeddings are L2-normalized upstream). Per-chunk
    metadata is kept in a dict keyed by the stringified index position and
    persisted as JSON next to the index file.
    """

    def __init__(self):
        self.config = config.config
        self.index = None            # faiss index, created lazily or loaded from disk
        self.chunks_metadata = {}    # str(index position) -> chunk metadata dict
        self.dimension = None        # embedding dimension of the active index

        # Paths
        self.store_path = Path(self.config.VECTOR_STORE_PATH)
        self.store_path.mkdir(parents=True, exist_ok=True)

        self.index_path = self.store_path / f"{self.config.INDEX_NAME}.index"
        self.metadata_path = self.store_path / f"{self.config.INDEX_NAME}_metadata.json"

        # Load existing index if available
        self._load_index()

    def _load_index(self):
        """Load an existing FAISS index and its metadata from disk, if both exist.

        Errors are logged and ignored; the store then starts empty.
        """
        try:
            if self.index_path.exists() and self.metadata_path.exists():
                logger.info("Loading existing FAISS index...")

                # Load FAISS index
                self.index = faiss.read_index(str(self.index_path))
                self.dimension = self.index.d

                # Load metadata
                with open(self.metadata_path, 'r') as f:
                    self.chunks_metadata = json.load(f)

                logger.info(f"Loaded index with {self.index.ntotal} vectors, dimension {self.dimension}")
            else:
                logger.info("No existing index found, will create new one")
        except Exception as e:
            logger.error(f"Error loading index: {str(e)}")

    def _initialize_index(self, dimension: int):
        """Initialize a new, empty FAISS index of the given dimension.

        Also clears the metadata map, since index positions restart at 0.

        Raises:
            Exception: Propagates any FAISS initialization failure.
        """
        try:
            # IndexFlatIP = exact inner-product search; matches cosine
            # similarity for normalized embeddings.
            self.index = faiss.IndexFlatIP(dimension)
            self.dimension = dimension
            self.chunks_metadata = {}
            logger.info(f"Initialized new FAISS index with dimension {dimension}")
        except Exception as e:
            logger.error(f"Error initializing index: {str(e)}")
            raise

    async def add_chunks(self, chunks: List[Chunk]) -> bool:
        """Add chunks (with precomputed embeddings) to the vector store.

        Chunks without an embedding are skipped. If the incoming embedding
        dimension differs from the existing index's, the index and metadata
        are rebuilt from scratch (previous vectors are discarded).

        Args:
            chunks: Chunks carrying `embedding` plus identifying fields.

        Returns:
            True on success (or empty input), False if no chunk had a
            valid embedding or an error occurred.
        """
        if not chunks:
            return True

        try:
            # Collect valid embeddings and their chunks first. Metadata
            # positions are assigned AFTER the dimension-mismatch check:
            # the previous implementation computed positions up front using
            # the pre-reset metadata count, so after a reset every metadata
            # key was offset and never matched the rebuilt index positions.
            embeddings = []
            valid_chunks = []
            for chunk in chunks:
                if chunk.embedding and len(chunk.embedding) > 0:
                    embeddings.append(chunk.embedding)
                    valid_chunks.append(chunk)

            if not embeddings:
                logger.warning("No valid embeddings found in chunks")
                return False

            # Check for dimension mismatch; rebuild from scratch if so
            if self.index is not None and self.dimension is not None:
                if len(embeddings[0]) != self.dimension:
                    logger.warning(f"Dimension mismatch! New embeddings have {len(embeddings[0])}, but index has {self.dimension}. Rebuilding index.")
                    # Reset index
                    self.index = None
                    self.chunks_metadata = {}
                    self.dimension = None

            # Initialize index if needed
            if self.index is None:
                self._initialize_index(len(embeddings[0]))

            # Map each new vector's index position to its chunk metadata,
            # continuing from the current metadata count.
            base = len(self.chunks_metadata)
            new_metadata = {}
            for offset, chunk in enumerate(valid_chunks):
                new_metadata[str(base + offset)] = {
                    "chunk_id": chunk.id,
                    "document_id": chunk.document_id,
                    "content": chunk.content,
                    "chunk_index": chunk.chunk_index,
                    "start_pos": chunk.start_pos,
                    "end_pos": chunk.end_pos,
                    "metadata": chunk.metadata
                }

            # Convert to numpy array and add to FAISS index
            embeddings_array = np.array(embeddings, dtype=np.float32)
            self.index.add(embeddings_array)

            # Update metadata
            self.chunks_metadata.update(new_metadata)

            # Persist index and metadata
            await self._save_index()

            logger.info(f"Added {len(embeddings)} chunks to vector store")
            return True

        except Exception as e:
            logger.error(f"Error adding chunks to vector store: {str(e)}")
            return False

    async def search(self, query_embedding: List[float], top_k: int = 5,
                     filters: Optional[Dict[str, Any]] = None) -> List[SearchResult]:
        """Search for chunks most similar to the query embedding.

        Note: filters are applied AFTER retrieving the top_k candidates,
        so fewer than top_k results may be returned when filters match
        only some of them.

        Args:
            query_embedding: Query vector (same dimension as the index).
            top_k: Maximum number of results to return.
            filters: Optional metadata filters (see _apply_filters).

        Returns:
            SearchResult list sorted by descending score; empty on error
            or when the index is empty.
        """
        if self.index is None or self.index.ntotal == 0:
            logger.warning("No index available or index is empty")
            return []

        try:
            # Convert query embedding to a (1, d) float32 array
            query_array = np.array([query_embedding], dtype=np.float32)

            # Perform search (never request more than the index holds)
            scores, indices = self.index.search(query_array, min(top_k, self.index.ntotal))

            # Convert results to SearchResult objects
            results = []
            for score, idx in zip(scores[0], indices[0]):
                if idx == -1:  # FAISS returns -1 for empty slots
                    continue

                chunk_metadata = self.chunks_metadata.get(str(idx))
                if chunk_metadata:
                    # Apply filters if specified
                    if filters and not self._apply_filters(chunk_metadata, filters):
                        continue

                    result = SearchResult(
                        chunk_id=chunk_metadata["chunk_id"],
                        document_id=chunk_metadata["document_id"],
                        content=chunk_metadata["content"],
                        score=float(score),
                        metadata=chunk_metadata.get("metadata", {})
                    )
                    results.append(result)

            # Sort by score (descending)
            results.sort(key=lambda x: x.score, reverse=True)

            logger.info(f"Found {len(results)} search results")
            return results

        except Exception as e:
            logger.error(f"Error searching vector store: {str(e)}")
            return []

    def _apply_filters(self, chunk_metadata: Dict[str, Any], filters: Dict[str, Any]) -> bool:
        """Check chunk metadata against search filters.

        Supported keys: "document_id" (exact match), "document_ids"
        (membership), "content_length_min"/"content_length_max".
        Unknown keys are ignored; on any error the filter fails OPEN
        (the chunk is kept).

        Returns:
            True if the chunk passes all filters.
        """
        try:
            for key, value in filters.items():
                if key == "document_id":
                    if chunk_metadata.get("document_id") != value:
                        return False
                elif key == "document_ids":
                    if chunk_metadata.get("document_id") not in value:
                        return False
                elif key == "content_length_min":
                    if len(chunk_metadata.get("content", "")) < value:
                        return False
                elif key == "content_length_max":
                    if len(chunk_metadata.get("content", "")) > value:
                        return False
                # Add more filter types as needed

            return True
        except Exception as e:
            logger.error(f"Error applying filters: {str(e)}")
            return True

    async def _save_index(self):
        """Persist the FAISS index and metadata map to disk.

        Errors are logged and swallowed (best-effort persistence).
        """
        try:
            if self.index is not None:
                # Save FAISS index
                faiss.write_index(self.index, str(self.index_path))

                # Save metadata
                with open(self.metadata_path, 'w') as f:
                    json.dump(self.chunks_metadata, f, indent=2)

                logger.debug("Saved index and metadata to disk")
        except Exception as e:
            logger.error(f"Error saving index: {str(e)}")

    async def get_stats(self) -> Dict[str, Any]:
        """Return diagnostic statistics about the vector store."""
        try:
            return {
                "total_vectors": self.index.ntotal if self.index else 0,
                "dimension": self.dimension,
                "index_type": type(self.index).__name__ if self.index else None,
                "metadata_entries": len(self.chunks_metadata),
                "index_file_exists": self.index_path.exists(),
                "metadata_file_exists": self.metadata_path.exists()
            }
        except Exception as e:
            logger.error(f"Error getting stats: {str(e)}")
            return {"error": str(e)}

    async def delete_document(self, document_id: str) -> bool:
        """Delete all chunks belonging to a document and rebuild the index.

        FAISS flat indexes don't support efficient single-vector removal,
        so the index is reconstructed vector-by-vector without the deleted
        entries — O(n) in the index size.

        Args:
            document_id: The document whose chunks should be removed.

        Returns:
            True if any chunks were removed, False otherwise.
        """
        try:
            # Find index positions to remove
            indices_to_remove = []
            for idx, metadata in self.chunks_metadata.items():
                if metadata.get("document_id") == document_id:
                    indices_to_remove.append(int(idx))

            if not indices_to_remove:
                logger.warning(f"No chunks found for document {document_id}")
                return False

            # Rebuild the index without the removed vectors
            if self.index and self.index.ntotal > 0:
                all_embeddings = []
                new_metadata = {}
                new_index = 0

                for old_idx in range(self.index.ntotal):
                    if old_idx not in indices_to_remove:
                        # reconstruct() recovers the stored vector by position
                        embedding = self.index.reconstruct(old_idx)
                        all_embeddings.append(embedding)

                        # Re-key surviving metadata to its new position
                        old_metadata = self.chunks_metadata.get(str(old_idx))
                        if old_metadata:
                            new_metadata[str(new_index)] = old_metadata
                        new_index += 1

                # Rebuild index
                if all_embeddings:
                    self._initialize_index(self.dimension)
                    embeddings_array = np.array(all_embeddings, dtype=np.float32)
                    self.index.add(embeddings_array)
                    self.chunks_metadata = new_metadata
                else:
                    # No embeddings left, create empty index
                    self._initialize_index(self.dimension)

            # Save updated index
            await self._save_index()

            logger.info(f"Deleted {len(indices_to_remove)} chunks for document {document_id}")
            return True

        except Exception as e:
            logger.error(f"Error deleting document chunks: {str(e)}")
            return False

    async def clear_all(self) -> bool:
        """Drop all in-memory state and delete the on-disk index files.

        Returns:
            True on success, False on error.
        """
        try:
            self.index = None
            self.chunks_metadata = {}
            self.dimension = None

            # Remove files
            if self.index_path.exists():
                self.index_path.unlink()
            if self.metadata_path.exists():
                self.metadata_path.unlink()

            logger.info("Cleared all data from vector store")
            return True
        except Exception as e:
            logger.error(f"Error clearing vector store: {str(e)}")
            return False