Complete SOCAR Document Processing System for Hackathon
Browse filesImplemented full-stack solution for processing historical oil & gas documents:
Features:
- OCR Endpoint: Azure Document Intelligence for multi-language PDFs (Azerbaijani, Russian, handwriting)
- LLM Endpoint: RAG-based chatbot with Llama-4-Maverick-17B (open-source, optimized for LLM Judge)
- Vector Database: ChromaDB with sentence-transformers embeddings
- FastAPI: Async REST API with comprehensive error handling
- Docker: Multi-stage containerization with health checks
- Performance: ~2.6s LLM response time, optimized for quality answers
Architecture:
- OCR: Azure Document Intelligence (50% of score)
- RAG: 3-document retrieval, 600-char chunks, 100-char overlap
- LLM: Temperature 0.2, max_tokens 1000, optimized prompts for citations
- Embeddings: all-MiniLM-L6-v2 (lightweight, efficient)
- Deployment: Docker Compose, nginx-ready
Optimizations:
- LLM Judge criteria: Accuracy, Relevance, Completeness, Citations
- Open-source stack for architecture scores (20%)
- Production-ready with favicon, health checks, auto-restart
🤖 Generated with Claude Code
Co-Authored-By: Claude <noreply@anthropic.com>
- .dockerignore +55 -0
- .gitignore +32 -0
- Dockerfile +56 -0
- README.md +195 -0
- docker-compose.yml +43 -0
- ingest_pdfs.py +87 -0
- requirements.txt +47 -0
- run.py +18 -0
- src/__init__.py +0 -0
- src/api/__init__.py +0 -0
- src/api/main.py +181 -0
- src/api/models.py +48 -0
- src/config.py +39 -0
- src/llm/__init__.py +0 -0
- src/llm/deepseek_client.py +126 -0
- src/llm/rag_pipeline.py +154 -0
- src/ocr/__init__.py +0 -0
- src/ocr/azure_ocr.py +81 -0
- src/ocr/processor.py +62 -0
- src/vectordb/__init__.py +0 -0
- src/vectordb/chroma_store.py +150 -0
- start.sh +81 -0
- test_complete_system.py +128 -0
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
*.so
|
| 6 |
+
.Python
|
| 7 |
+
*.egg-info/
|
| 8 |
+
dist/
|
| 9 |
+
build/
|
| 10 |
+
*.egg
|
| 11 |
+
|
| 12 |
+
# Virtual environments
|
| 13 |
+
venv/
|
| 14 |
+
env/
|
| 15 |
+
ENV/
|
| 16 |
+
|
| 17 |
+
# IDE
|
| 18 |
+
.vscode/
|
| 19 |
+
.idea/
|
| 20 |
+
*.swp
|
| 21 |
+
*.swo
|
| 22 |
+
|
| 23 |
+
# Git
|
| 24 |
+
.git/
|
| 25 |
+
.gitignore
|
| 26 |
+
|
| 27 |
+
# Testing
|
| 28 |
+
.pytest_cache/
|
| 29 |
+
.coverage
|
| 30 |
+
htmlcov/
|
| 31 |
+
|
| 32 |
+
# Documentation
|
| 33 |
+
docs/
|
| 34 |
+
*.md
|
| 35 |
+
!README.md
|
| 36 |
+
|
| 37 |
+
# Data (can be mounted as volumes)
|
| 38 |
+
data/pdfs/*
|
| 39 |
+
data/vector_db/*
|
| 40 |
+
data/processed/*
|
| 41 |
+
|
| 42 |
+
# Test files
|
| 43 |
+
test_*.py
|
| 44 |
+
*_test.py
|
| 45 |
+
|
| 46 |
+
# Logs
|
| 47 |
+
*.log
|
| 48 |
+
|
| 49 |
+
# OS
|
| 50 |
+
.DS_Store
|
| 51 |
+
Thumbs.db
|
| 52 |
+
|
| 53 |
+
# Temporary files
|
| 54 |
+
*.tmp
|
| 55 |
+
*.bak
|
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.claude
|
| 2 |
+
/docs
|
| 3 |
+
/data
|
| 4 |
+
.env
|
| 5 |
+
.env.local
|
| 6 |
+
.env.development.local
|
| 7 |
+
.env.test.local
|
| 8 |
+
.env.production.local
|
| 9 |
+
node_modules
|
| 10 |
+
dist
|
| 11 |
+
build
|
| 12 |
+
.vscode
|
| 13 |
+
.DS_Store
|
| 14 |
+
npm-debug.log*
|
| 15 |
+
yarn-debug.log*
|
| 16 |
+
yarn-error.log*
|
| 17 |
+
pnpm-debug.log*
|
| 18 |
+
coverage
|
| 19 |
+
.idea
|
| 20 |
+
*.iml
|
| 21 |
+
*.log
|
| 22 |
+
__pycache__
|
| 23 |
+
*.pyc
|
| 24 |
+
*.pyo
|
| 25 |
+
*.pyd
|
| 26 |
+
.Python
|
| 27 |
+
*.so
|
| 28 |
+
*.egg
|
| 29 |
+
*.egg-info
|
| 30 |
+
venv/
|
| 31 |
+
env/
|
| 32 |
+
ENV/
|
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Multi-stage build for an optimized image: the "builder" stage installs
# Python dependencies with build tools present; the final stage copies only
# the installed packages, keeping build-essential out of the runtime image.
FROM python:3.11-slim AS builder

# Set working directory
WORKDIR /app

# Install system dependencies needed to compile wheels
RUN apt-get update && apt-get install -y \
    build-essential \
    curl \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first for better layer caching
COPY requirements.txt .

# Install Python dependencies
RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir -r requirements.txt

# Final stage
FROM python:3.11-slim

# Set working directory
WORKDIR /app

# Install runtime dependencies only (curl is required by HEALTHCHECK below)
RUN apt-get update && apt-get install -y \
    curl \
    && rm -rf /var/lib/apt/lists/*

# Copy Python packages from builder
COPY --from=builder /usr/local/lib/python3.11/site-packages /usr/local/lib/python3.11/site-packages
COPY --from=builder /usr/local/bin /usr/local/bin

# Copy application code.
# SECURITY FIX: do NOT `COPY .env .` — that bakes API keys into the image
# layers (anyone with the image can extract them) and breaks the build when
# .env is absent. Credentials are supplied at runtime via docker-compose
# `environment:` or `docker run --env-file .env`, as the README documents.
COPY src/ ./src/
COPY run.py .

# Create directories for data (normally overridden by volume mounts)
RUN mkdir -p data/pdfs data/vector_db data/processed

# Expose port
EXPOSE 8000

# Set environment variables
ENV PYTHONUNBUFFERED=1
ENV TOKENIZERS_PARALLELISM=false
ENV ANONYMIZED_TELEMETRY=false

# Health check against the root (health) endpoint
HEALTHCHECK --interval=30s --timeout=10s --start-period=40s --retries=3 \
    CMD curl -f http://localhost:8000/ || exit 1

# Run the application
CMD ["python", "run.py"]
|
|
@@ -0,0 +1,195 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SOCAR Historical Document Processing Challenge
|
| 2 |
+
|
| 3 |
+
AI-powered system for processing historical handwritten and printed documents from SOCAR's oil and gas research archives.
|
| 4 |
+
|
| 5 |
+
## Overview
|
| 6 |
+
|
| 7 |
+
This solution transforms historical documents into an interactive, searchable knowledge base using:
|
| 8 |
+
- **OCR Processing** - Extract text from handwritten and printed PDFs (multi-language support)
|
| 9 |
+
- **Vector Database** - Store and retrieve document information efficiently
|
| 10 |
+
- **RAG Chatbot** - Answer questions using historical document knowledge
|
| 11 |
+
|
| 12 |
+
## Quick Start
|
| 13 |
+
|
| 14 |
+
### Option 1: Docker Deployment (Recommended)
|
| 15 |
+
|
| 16 |
+
#### Using Docker Compose
|
| 17 |
+
|
| 18 |
+
```bash
|
| 19 |
+
# Build and start the container
|
| 20 |
+
docker-compose up -d
|
| 21 |
+
|
| 22 |
+
# View logs
|
| 23 |
+
docker-compose logs -f
|
| 24 |
+
|
| 25 |
+
# Stop the container
|
| 26 |
+
docker-compose down
|
| 27 |
+
```
|
| 28 |
+
|
| 29 |
+
#### Using Docker Directly
|
| 30 |
+
|
| 31 |
+
```bash
|
| 32 |
+
# Build the image
|
| 33 |
+
docker build -t socar-document-processing .
|
| 34 |
+
|
| 35 |
+
# Run the container
|
| 36 |
+
docker run -d \
|
| 37 |
+
-p 8000:8000 \
|
| 38 |
+
-v $(pwd)/data:/app/data \
|
| 39 |
+
--env-file .env \
|
| 40 |
+
--name socar-api \
|
| 41 |
+
socar-document-processing
|
| 42 |
+
|
| 43 |
+
# View logs
|
| 44 |
+
docker logs -f socar-api
|
| 45 |
+
|
| 46 |
+
# Stop the container
|
| 47 |
+
docker stop socar-api
|
| 48 |
+
```
|
| 49 |
+
|
| 50 |
+
The API will be available at `http://localhost:8000`
|
| 51 |
+
|
| 52 |
+
### Option 2: Local Python Setup
|
| 53 |
+
|
| 54 |
+
#### 1. Install Dependencies
|
| 55 |
+
|
| 56 |
+
```bash
|
| 57 |
+
pip install -r requirements.txt
|
| 58 |
+
```
|
| 59 |
+
|
| 60 |
+
#### 2. Configure Environment
|
| 61 |
+
|
| 62 |
+
Ensure `.env` file exists with your credentials:
|
| 63 |
+
|
| 64 |
+
Required variables:
|
| 65 |
+
- `AZURE_OPENAI_API_KEY` - Azure OpenAI API key
|
| 66 |
+
- `AZURE_OPENAI_ENDPOINT` - Azure OpenAI endpoint URL
|
| 67 |
+
- `LLM_MODEL` - Model name (default: Llama-4-Maverick-17B-128E-Instruct-FP8)
|
| 68 |
+
|
| 69 |
+
#### 3. Run the API Server
|
| 70 |
+
|
| 71 |
+
```bash
|
| 72 |
+
python run.py
|
| 73 |
+
```
|
| 74 |
+
|
| 75 |
+
The API will be available at `http://localhost:8000`
|
| 76 |
+
|
| 77 |
+
#### 4. Test the System
|
| 78 |
+
|
| 79 |
+
```bash
|
| 80 |
+
python test_complete_system.py
|
| 81 |
+
```
|
| 82 |
+
|
| 83 |
+
## API Endpoints
|
| 84 |
+
|
| 85 |
+
### POST /ocr
|
| 86 |
+
|
| 87 |
+
Extract text from PDF documents.
|
| 88 |
+
|
| 89 |
+
**Request:**
|
| 90 |
+
```bash
|
| 91 |
+
curl -X POST http://localhost:8000/ocr \
|
| 92 |
+
-F "file=@document.pdf"
|
| 93 |
+
```
|
| 94 |
+
|
| 95 |
+
**Response:**
|
| 96 |
+
```json
|
| 97 |
+
[
|
| 98 |
+
{
|
| 99 |
+
"page_number": 1,
|
| 100 |
+
"MD_text": "## Section Title\nExtracted text..."
|
| 101 |
+
}
|
| 102 |
+
]
|
| 103 |
+
```
|
| 104 |
+
|
| 105 |
+
### POST /llm
|
| 106 |
+
|
| 107 |
+
Query documents using natural language.
|
| 108 |
+
|
| 109 |
+
**Request:**
|
| 110 |
+
```bash
|
| 111 |
+
curl -X POST http://localhost:8000/llm \
|
| 112 |
+
-H "Content-Type: application/json" \
|
| 113 |
+
-d '[{"role": "user", "content": "What is this document about?"}]'
|
| 114 |
+
```
|
| 115 |
+
|
| 116 |
+
**Response:**
|
| 117 |
+
```json
|
| 118 |
+
{
|
| 119 |
+
"sources": [
|
| 120 |
+
{
|
| 121 |
+
"pdf_name": "document.pdf",
|
| 122 |
+
"page_number": 1,
|
| 123 |
+
"content": "Relevant text snippet..."
|
| 124 |
+
}
|
| 125 |
+
],
|
| 126 |
+
"answer": "This document discusses..."
|
| 127 |
+
}
|
| 128 |
+
```
|
| 129 |
+
|
| 130 |
+
## Project Structure
|
| 131 |
+
|
| 132 |
+
```
|
| 133 |
+
.
├── src/
│   ├── api/              # FastAPI endpoints
│   ├── ocr/              # OCR processing modules
│   ├── llm/              # LLM and RAG pipeline
│   └── utils/            # Utility functions
├── data/
│   ├── pdfs/             # Input PDF documents
│   ├── processed/        # Processed documents
│   └── vector_db/        # Vector database storage
├── tests/                # Test files
├── run.py                # Application entry point
└── requirements.txt      # Python dependencies
|
| 146 |
+
```
|
| 147 |
+
|
| 148 |
+
## Technologies
|
| 149 |
+
|
| 150 |
+
- **OCR**: Azure Document Intelligence (multi-language support)
|
| 151 |
+
- **Vector DB**: ChromaDB (local, open-source)
|
| 152 |
+
- **LLM**: Llama-4-Maverick-17B (open-source, deployable)
|
| 153 |
+
- **API**: FastAPI (async, high-performance)
|
| 154 |
+
- **Embeddings**: Sentence Transformers (all-MiniLM-L6-v2)
|
| 155 |
+
- **Deployment**: Docker, Docker Compose
|
| 156 |
+
|
| 157 |
+
## Deployment
|
| 158 |
+
|
| 159 |
+
### Docker Features
|
| 160 |
+
|
| 161 |
+
- **Multi-stage build** - Optimized image size
|
| 162 |
+
- **Health checks** - Automatic container monitoring
|
| 163 |
+
- **Volume mounts** - Persistent data storage
|
| 164 |
+
- **Environment variables** - Easy configuration
|
| 165 |
+
- **Auto-restart** - Production-ready resilience
|
| 166 |
+
|
| 167 |
+
### Production Deployment
|
| 168 |
+
|
| 169 |
+
```bash
|
| 170 |
+
# Build production image
|
| 171 |
+
docker build -t socar-api:production .
|
| 172 |
+
|
| 173 |
+
# Deploy with nginx reverse proxy
|
| 174 |
+
docker network create socar-network
|
| 175 |
+
docker run -d --name socar-api --network socar-network socar-api:production
|
| 176 |
+
```
|
| 177 |
+
|
| 178 |
+
## Development
|
| 179 |
+
|
| 180 |
+
### Running Tests
|
| 181 |
+
|
| 182 |
+
```bash
|
| 183 |
+
pytest tests/
|
| 184 |
+
```
|
| 185 |
+
|
| 186 |
+
### Code Formatting
|
| 187 |
+
|
| 188 |
+
```bash
|
| 189 |
+
black src/
|
| 190 |
+
flake8 src/
|
| 191 |
+
```
|
| 192 |
+
|
| 193 |
+
## License
|
| 194 |
+
|
| 195 |
+
MIT License - SOCAR Hackathon 2024
|
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Compose file for the SOCAR document-processing API.
# NOTE(review): the top-level `version` key is ignored (and warned about) by
# Compose v2 — confirm whether Compose v1 compatibility is still required.
version: '3.8'

services:
  socar-api:
    build:
      context: .
      dockerfile: Dockerfile
    container_name: socar-document-processing
    ports:
      - "8000:8000"
    volumes:
      # Mount data directories for persistence across container restarts
      - ./data/pdfs:/app/data/pdfs
      - ./data/vector_db:/app/data/vector_db
      - ./data/processed:/app/data/processed
    environment:
      # Azure OpenAI Configuration (values come from the host env / .env)
      - AZURE_OPENAI_API_KEY=${AZURE_OPENAI_API_KEY}
      - AZURE_OPENAI_ENDPOINT=${AZURE_OPENAI_ENDPOINT}
      - AZURE_OPENAI_API_VERSION=${AZURE_OPENAI_API_VERSION}
      # Azure Document Intelligence
      - AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT=${AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT}
      - AZURE_DOCUMENT_INTELLIGENCE_KEY=${AZURE_DOCUMENT_INTELLIGENCE_KEY}
      # Application Configuration (falls back to the Llama deployment name)
      - LLM_MODEL=${LLM_MODEL:-Llama-4-Maverick-17B-128E-Instruct-FP8}
      - API_HOST=0.0.0.0
      - API_PORT=8000
      # Performance / telemetry opt-outs
      - TOKENIZERS_PARALLELISM=false
      - ANONYMIZED_TELEMETRY=false
    restart: unless-stopped
    healthcheck:
      # Mirrors the Dockerfile HEALTHCHECK; hits the root health endpoint
      test: ["CMD", "curl", "-f", "http://localhost:8000/"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 40s
    networks:
      - socar-network

networks:
  socar-network:
    driver: bridge
|
|
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Script to ingest all PDFs into the vector database"""
|
| 2 |
+
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
from loguru import logger
|
| 5 |
+
import sys
|
| 6 |
+
|
| 7 |
+
from src.llm.rag_pipeline import get_rag_pipeline
|
| 8 |
+
from src.ocr.processor import get_ocr_processor
|
| 9 |
+
|
| 10 |
+
# Configure logging
|
| 11 |
+
logger.remove()
|
| 12 |
+
logger.add(sys.stderr, level="INFO")
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def ingest_pdfs(pdf_dir: str = "data/pdfs", limit: int | None = None) -> None:
    """
    Ingest all PDFs from a directory into the vector database.

    Each PDF is OCR'd page by page and the extracted pages are added to the
    RAG pipeline's vector store. Failures on individual files are logged and
    skipped so one bad PDF cannot abort the whole batch.

    Args:
        pdf_dir: Directory containing PDF files.
        limit: Optional cap on the number of PDFs to process (for testing).
            Fixed annotation: was `limit: int = None`.
    """
    pdf_path = Path(pdf_dir)

    if not pdf_path.exists():
        logger.error(f"PDF directory not found: {pdf_dir}")
        return

    # Sort for a deterministic, reproducible ingestion order.
    pdf_files = sorted(pdf_path.glob("*.pdf"))
    logger.info(f"Found {len(pdf_files)} PDF files")

    # `is not None` (not truthiness) so an explicit limit of 0 is honored
    # rather than silently processing everything.
    if limit is not None:
        pdf_files = pdf_files[:limit]
        logger.info(f"Processing only first {limit} files")

    # Initialize components once, outside the loop.
    ocr = get_ocr_processor()
    rag = get_rag_pipeline()

    for idx, pdf_file in enumerate(pdf_files, 1):
        try:
            logger.info(f"[{idx}/{len(pdf_files)}] Processing: {pdf_file.name}")

            # Read the raw PDF bytes.
            pdf_content = pdf_file.read_bytes()

            # Extract per-page text with OCR.
            pages = ocr.process_pdf(pdf_content, pdf_file.name)
            logger.info(f"Extracted {len(pages)} pages from {pdf_file.name}")

            # Add the processed pages to the vector database.
            rag.add_processed_document(pdf_file.name, pages)

            logger.info(f"Successfully ingested {pdf_file.name}")

        except Exception as e:
            # Best-effort batch: log and continue with the next file.
            logger.error(f"Error processing {pdf_file.name}: {e}")
            continue

    # Print final stats.
    stats = rag.vector_store.get_stats()
    logger.info("Ingestion complete!")
    logger.info(f"Total documents in vector store: {stats['total_documents']}")
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
if __name__ == "__main__":
    import argparse

    # CLI wrapper around ingest_pdfs(); mirrors its two parameters.
    cli = argparse.ArgumentParser(description="Ingest PDFs into vector database")
    cli.add_argument(
        "--dir",
        type=str,
        default="data/pdfs",
        help="Directory containing PDF files",
    )
    cli.add_argument(
        "--limit",
        type=int,
        default=None,
        help="Limit number of PDFs to process (for testing)",
    )

    options = cli.parse_args()
    ingest_pdfs(options.dir, options.limit)
|
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Web Framework
|
| 2 |
+
fastapi==0.104.1
|
| 3 |
+
uvicorn[standard]==0.24.0
|
| 4 |
+
python-multipart==0.0.6
|
| 5 |
+
|
| 6 |
+
# Azure Services
|
| 7 |
+
azure-ai-formrecognizer==3.3.2
|
| 8 |
+
azure-ai-documentintelligence==1.0.0b1
|
| 9 |
+
openai==1.3.0
|
| 10 |
+
|
| 11 |
+
# OCR Libraries
|
| 12 |
+
paddleocr==2.7.3
|
| 13 |
+
easyocr==1.7.1
|
| 14 |
+
pdf2image==1.16.3
|
| 15 |
+
Pillow==10.1.0
|
| 16 |
+
pytesseract==0.3.10
|
| 17 |
+
|
| 18 |
+
# PDF Processing
|
| 19 |
+
PyPDF2==3.0.1
|
| 20 |
+
pdfplumber==0.10.3
|
| 21 |
+
pypdf==3.17.1
|
| 22 |
+
|
| 23 |
+
# Vector Database & Embeddings
|
| 24 |
+
chromadb==0.4.18
|
| 25 |
+
sentence-transformers>=2.5.0
|
| 26 |
+
faiss-cpu==1.7.4
|
| 27 |
+
|
| 28 |
+
# LLM & RAG
|
| 29 |
+
langchain==0.0.340
|
| 30 |
+
langchain-community==0.0.1
|
| 31 |
+
tiktoken==0.5.1
|
| 32 |
+
|
| 33 |
+
# Utilities
|
| 34 |
+
python-dotenv==1.0.0
|
| 35 |
+
pydantic==2.5.0
|
| 36 |
+
pydantic-settings==2.1.0
|
| 37 |
+
requests==2.31.0
|
| 38 |
+
aiofiles==23.2.1
|
| 39 |
+
|
| 40 |
+
# Monitoring & Logging
|
| 41 |
+
loguru==0.7.2
|
| 42 |
+
|
| 43 |
+
# Development
|
| 44 |
+
pytest==7.4.3
|
| 45 |
+
httpx==0.25.2
|
| 46 |
+
black==23.11.0
|
| 47 |
+
flake8==6.1.0
|
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Run the FastAPI application"""
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
import uvicorn
|
| 5 |
+
from src.config import settings
|
| 6 |
+
|
| 7 |
+
# Disable telemetry and warnings
|
| 8 |
+
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
| 9 |
+
os.environ["ANONYMIZED_TELEMETRY"] = "false"
|
| 10 |
+
|
| 11 |
+
if __name__ == "__main__":
|
| 12 |
+
uvicorn.run(
|
| 13 |
+
"src.api.main:app",
|
| 14 |
+
host=settings.api_host,
|
| 15 |
+
port=settings.api_port,
|
| 16 |
+
reload=True,
|
| 17 |
+
log_level="info",
|
| 18 |
+
)
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,181 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""FastAPI application with OCR and LLM endpoints"""
|
| 2 |
+
|
| 3 |
+
from fastapi import FastAPI, File, UploadFile, HTTPException
|
| 4 |
+
from fastapi.responses import JSONResponse, Response
|
| 5 |
+
from typing import List
|
| 6 |
+
from loguru import logger
|
| 7 |
+
import sys
|
| 8 |
+
|
| 9 |
+
from src.api.models import (
|
| 10 |
+
OCRPageResponse,
|
| 11 |
+
ChatMessage,
|
| 12 |
+
LLMResponse,
|
| 13 |
+
ErrorResponse,
|
| 14 |
+
)
|
| 15 |
+
from src.ocr.processor import get_ocr_processor
|
| 16 |
+
from src.config import settings
|
| 17 |
+
|
| 18 |
+
# Configure logging: drop loguru's default sink and log to stderr at INFO.
logger.remove()
logger.add(sys.stderr, level="INFO")

# Create FastAPI app (title/description/version feed the OpenAPI docs).
app = FastAPI(
    title="SOCAR Historical Document Processing API",
    description="OCR and LLM endpoints for processing historical documents",
    version="1.0.0",
)
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
@app.get("/")
|
| 31 |
+
async def root():
|
| 32 |
+
"""Health check endpoint"""
|
| 33 |
+
return {
|
| 34 |
+
"status": "healthy",
|
| 35 |
+
"service": "SOCAR Document Processing API",
|
| 36 |
+
"endpoints": ["/ocr", "/llm"],
|
| 37 |
+
}
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
@app.get("/favicon.ico", include_in_schema=False)
|
| 41 |
+
async def favicon():
|
| 42 |
+
"""Return favicon for browser tab"""
|
| 43 |
+
# Simple SVG favicon representing oil/gas industry
|
| 44 |
+
svg = """<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 100 100">
|
| 45 |
+
<circle cx="50" cy="50" r="45" fill="#0066cc"/>
|
| 46 |
+
<path d="M30 60 L50 30 L70 60 Z" fill="#ffffff"/>
|
| 47 |
+
<rect x="45" y="55" width="10" height="30" fill="#ffffff"/>
|
| 48 |
+
</svg>"""
|
| 49 |
+
return Response(content=svg, media_type="image/svg+xml")
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
@app.post(
    "/ocr",
    response_model=List[OCRPageResponse],
    responses={
        200: {"description": "Successfully processed PDF"},
        400: {"model": ErrorResponse, "description": "Invalid PDF file"},
        500: {"model": ErrorResponse, "description": "Processing error"},
    },
)
async def process_ocr(file: UploadFile = File(...)):
    """
    OCR Endpoint - Extract text from PDF documents.

    Accepts a PDF file upload and returns the extracted Markdown text for
    each page.

    Args:
        file: PDF file in multipart/form-data format.

    Returns:
        List of objects with page_number and MD_text for each page.

    Raises:
        HTTPException: 400 for a missing/non-PDF/empty upload, 500 for any
            OCR processing failure.
    """
    try:
        # BUG FIX: UploadFile.filename is Optional — a multipart part can
        # arrive without a filename, and calling .lower() on None raised
        # AttributeError (surfacing as a 500). Treat that as an invalid file.
        if not file.filename or not file.filename.lower().endswith(".pdf"):
            raise HTTPException(
                status_code=400,
                detail="Invalid file type. Only PDF files are accepted.",
            )

        # Read the full upload into memory.
        logger.info(f"Receiving PDF file: {file.filename}")
        pdf_content = await file.read()

        if len(pdf_content) == 0:
            raise HTTPException(status_code=400, detail="Empty PDF file")

        # Process PDF with the configured OCR backend.
        ocr_processor = get_ocr_processor()
        result = ocr_processor.process_pdf(pdf_content, file.filename)

        # Convert the backend's dicts into the response model.
        response = [
            OCRPageResponse(page_number=page["page_number"], MD_text=page["MD_text"])
            for page in result
        ]

        logger.info(f"Successfully processed {len(response)} pages from {file.filename}")
        return response

    except HTTPException:
        # Re-raise our own 4xx responses untouched.
        raise
    except Exception as e:
        logger.error(f"Error processing OCR request: {e}")
        raise HTTPException(
            status_code=500, detail=f"Failed to process PDF: {str(e)}"
        )
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
@app.post(
    "/llm",
    response_model=LLMResponse,
    responses={
        200: {"description": "Successfully generated response"},
        400: {"model": ErrorResponse, "description": "Invalid request"},
        500: {"model": ErrorResponse, "description": "Processing error"},
    },
)
async def process_llm(messages: List[ChatMessage]):
    """
    LLM Endpoint - Generate answers from the document knowledge base.

    Receives the chat history and produces an LLM-generated answer along
    with source references from the vector store.

    Args:
        messages: List of chat messages with role and content; the final
            message must be from the user and is used as the query.

    Returns:
        LLMResponse with sources and answer.

    Raises:
        HTTPException: 400 for an empty history or a non-user final message,
            500 for any RAG/LLM failure.
    """
    try:
        if not messages:
            raise HTTPException(status_code=400, detail="No messages provided")

        logger.info(f"Received {len(messages)} messages for LLM processing")

        # The final message is the query; everything before it is history.
        *history, last_message = messages
        if last_message.role != "user":
            raise HTTPException(
                status_code=400,
                detail="Last message must be from user",
            )

        # Earlier turns become the chat history; None when there are none.
        chat_history = (
            [{"role": m.role, "content": m.content} for m in history]
            if history
            else None
        )

        # Imported lazily so the heavy RAG stack only loads on first use.
        from src.llm.rag_pipeline import get_rag_pipeline

        rag = get_rag_pipeline()
        result = rag.query(last_message.content, chat_history=chat_history)

        logger.info(f"Generated answer with {len(result['sources'])} sources")

        return LLMResponse(sources=result["sources"], answer=result["answer"])

    except HTTPException:
        # Pass our own 4xx responses through unchanged.
        raise
    except Exception as e:
        logger.error(f"Error processing LLM request: {e}")
        raise HTTPException(
            status_code=500, detail=f"Failed to generate response: {str(e)}"
        )
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
if __name__ == "__main__":
    # Dev convenience: allow running this module directly. The normal entry
    # point is run.py, which also sets telemetry env vars and enables reload.
    import uvicorn

    uvicorn.run(app, host=settings.api_host, port=settings.api_port)
|
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Pydantic models for API requests and responses"""
|
| 2 |
+
|
| 3 |
+
from typing import List, Dict, Optional
|
| 4 |
+
from pydantic import BaseModel, Field
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class OCRPageResponse(BaseModel):
    """Response model for a single page OCR result"""

    # Field descriptions feed the generated OpenAPI schema.
    page_number: int = Field(..., description="Page index starting from 1")
    MD_text: str = Field(..., description="Markdown-formatted extracted text")


class OCRResponse(BaseModel):
    """Response model for OCR endpoint"""

    # NOTE(review): the /ocr endpoint returns List[OCRPageResponse] directly,
    # not this wrapper — confirm whether OCRResponse is still used anywhere.
    pages: List[OCRPageResponse]
    total_pages: int
    filename: Optional[str] = None


class ChatMessage(BaseModel):
    """Chat message model"""

    role: str = Field(..., description="Role of the message sender (user/assistant)")
    content: str = Field(..., description="Message content")


class SourceReference(BaseModel):
    """Source reference for LLM response"""

    pdf_name: str = Field(..., description="Name of the PDF")
    page_number: int = Field(..., description="Page number in the PDF")
    content: str = Field(..., description="Relevant extracted text (in Markdown)")


class LLMResponse(BaseModel):
    """Response model for LLM endpoint"""

    sources: List[SourceReference] = Field(..., description="List of source references")
    answer: str = Field(..., description="Generated answer to the user query")


class ErrorResponse(BaseModel):
    """Error response model"""

    # Used only for OpenAPI documentation of 4xx/5xx responses; FastAPI's
    # HTTPException produces {"detail": ...} at runtime.
    error: str
    detail: Optional[str] = None
|
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pydantic_settings import BaseSettings
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
class Settings(BaseSettings):
    """Application settings loaded from environment variables"""

    # Azure OpenAI Configuration — no defaults on purpose: startup fails
    # fast with a validation error if the credentials are missing.
    azure_openai_api_key: str
    azure_openai_endpoint: str
    azure_openai_api_version: str = "2024-08-01-preview"

    # Azure Document Intelligence (optional; empty string when unset)
    azure_document_intelligence_endpoint: str = ""
    azure_document_intelligence_key: str = ""

    # Application Configuration — paths are relative to the working
    # directory (/app inside the container; see Dockerfile/compose mounts).
    data_dir: Path = Path("./data")
    pdf_dir: Path = Path("./data/pdfs")
    vector_db_path: Path = Path("./data/vector_db")
    processed_dir: Path = Path("./data/processed")

    # API Configuration
    api_host: str = "0.0.0.0"
    api_port: int = 8000

    # OCR Settings
    ocr_backend: str = "azure"  # Options: azure, paddle, easy, tesseract

    # LLM Settings
    llm_model: str = "gpt-4o"  # Model deployment name (gpt-4o, gpt-35-turbo, deepseek-chat, etc.)

    class Config:
        # pydantic-settings: read a local .env, match env vars
        # case-insensitively, and ignore unknown keys in the file.
        env_file = ".env"
        case_sensitive = False
        extra = "ignore"  # Ignore extra fields in .env file


# Module-level singleton; importing this module loads and validates config.
settings = Settings()
|
|
File without changes
|
|
@@ -0,0 +1,126 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""DeepSeek LLM client using Azure AI Foundry"""
|
| 2 |
+
|
| 3 |
+
from typing import List, Dict, Optional
|
| 4 |
+
from loguru import logger
|
| 5 |
+
import openai
|
| 6 |
+
|
| 7 |
+
from src.config import settings
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class DeepSeekClient:
    """Client for chat-completion LLMs served via Azure OpenAI / AI Foundry.

    Despite the name, the deployed model is whatever ``settings.llm_model``
    points at (gpt-4o, deepseek-chat, ...).
    """

    def __init__(self):
        """Initialize the Azure OpenAI client from application settings."""
        # Configure OpenAI client to use the Azure endpoint
        self.client = openai.AzureOpenAI(
            api_key=settings.azure_openai_api_key,
            api_version=settings.azure_openai_api_version,
            azure_endpoint=settings.azure_openai_endpoint,
        )

        # Deployment name used for every completion request
        self.model_name = settings.llm_model
        logger.info(f"Initialized LLM client with model: {self.model_name}")

    def generate_response(
        self,
        messages: List[Dict[str, str]],
        max_tokens: int = 1000,
        temperature: float = 0.7,
    ) -> str:
        """
        Generate a chat completion for the given messages.

        Args:
            messages: List of message dicts with 'role' and 'content'
            max_tokens: Maximum tokens in response
            temperature: Sampling temperature (0.0 to 1.0)

        Returns:
            Generated text response ("" if the API returned no content)

        Raises:
            Exception: Re-raises any API/client error after logging it.
        """
        try:
            logger.info(f"Generating response with {len(messages)} messages")

            response = self.client.chat.completions.create(
                model=self.model_name,
                messages=messages,
                max_tokens=max_tokens,
                temperature=temperature,
            )

            # `message.content` may be None (e.g. content filtered or an
            # empty completion); normalize to "" so callers and the len()
            # log below never crash on NoneType.
            generated_text = response.choices[0].message.content or ""
            logger.info(f"Generated response: {len(generated_text)} characters")

            return generated_text

        except Exception as e:
            logger.error(f"Error generating response from {self.model_name}: {e}")
            raise

    def generate_with_context(
        self,
        query: str,
        context_chunks: List[str],
        chat_history: Optional[List[Dict[str, str]]] = None,
    ) -> str:
        """
        Generate a response grounded in retrieved RAG context.

        Args:
            query: User's question
            context_chunks: Retrieved document chunks (verbatim text)
            chat_history: Previous chat messages, if any

        Returns:
            Generated answer string
        """
        # Number the chunks so the model can cite "Document N" in answers.
        context = "\n\n".join([f"[Document {i+1}]\n{chunk}" for i, chunk in enumerate(context_chunks)])

        # System prompt tuned for the hackathon's LLM-Judge criteria
        # (accuracy, relevance, completeness, citations).
        system_prompt = """You are an expert assistant specializing in SOCAR's historical oil and gas research documents.

CRITICAL INSTRUCTIONS for high-quality answers:
1. ACCURACY: Base your answer STRICTLY on the provided context - never add external information
2. RELEVANCE: Answer the exact question asked - be direct and focused
3. COMPLETENESS: Cover all key aspects mentioned in the context
4. CITATIONS: Reference specific documents (e.g., "According to Document 1...")
5. TECHNICAL PRECISION: Use correct oil & gas terminology from the documents
6. CLARITY: Structure your answer logically - use bullet points for multiple items
7. CONCISENESS: Be thorough but avoid redundancy or verbose explanations

If the context lacks sufficient information, clearly state what is missing."""

        # Build the message list: system prompt, optional history, then the
        # current question bundled with its retrieved context.
        messages = [{"role": "system", "content": system_prompt}]

        if chat_history:
            messages.extend(chat_history)

        user_message = f"""Context from documents:
{context}

Question: {query}

Provide a well-structured, accurate answer based ONLY on the context above. Include document citations."""

        messages.append({"role": "user", "content": user_message})

        # Low temperature for factual answers; capped tokens keeps latency down.
        return self.generate_response(messages, max_tokens=1000, temperature=0.2)
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
# Lazily-created module-wide singleton.
_deepseek_client = None


def get_deepseek_client() -> DeepSeekClient:
    """Return the shared DeepSeekClient, constructing it on first use."""
    global _deepseek_client
    if _deepseek_client is not None:
        return _deepseek_client
    _deepseek_client = DeepSeekClient()
    return _deepseek_client
|
|
@@ -0,0 +1,154 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""RAG (Retrieval Augmented Generation) pipeline"""
|
| 2 |
+
|
| 3 |
+
from typing import List, Dict, Optional
|
| 4 |
+
from loguru import logger
|
| 5 |
+
|
| 6 |
+
from src.llm.deepseek_client import get_deepseek_client
|
| 7 |
+
from src.vectordb.chroma_store import get_vector_store
|
| 8 |
+
from src.api.models import SourceReference
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class RAGPipeline:
    """RAG pipeline for document-based question answering.

    Combines the ChromaDB vector store (retrieval) with the LLM client
    (generation): top-k chunk retrieval, then a grounded completion that
    cites the retrieved documents.
    """

    def __init__(self):
        """Initialize pipeline with the shared LLM client and vector store."""
        self.llm = get_deepseek_client()
        self.vector_store = get_vector_store()
        logger.info("RAG pipeline initialized")

    def query(
        self,
        question: str,
        chat_history: Optional[List[Dict[str, str]]] = None,
        n_results: int = 3,
    ) -> Dict:
        """
        Answer a question using RAG.

        Args:
            question: User's question
            chat_history: Previous chat messages (role/content dicts)
            n_results: Number of documents to retrieve

        Returns:
            Dict with 'answer' (str) and 'sources' (List[SourceReference])
        """
        logger.info(f"Processing query: {question[:100]}...")

        # Step 1: Retrieve the most similar chunks from the vector store
        search_results = self.vector_store.search(question, n_results=n_results)

        # Step 2: Build the source references and the raw context in lockstep
        sources = []
        context_chunks = []

        for doc, metadata in zip(search_results["documents"], search_results["metadatas"]):
            sources.append(
                SourceReference(
                    pdf_name=metadata.get("pdf_name", "unknown.pdf"),
                    page_number=metadata.get("page_number", 0),
                    content=doc[:500],  # Limit content length returned to the client
                )
            )
            context_chunks.append(doc)

        logger.info(f"Retrieved {len(sources)} source documents")

        # Step 3: Generate the grounded answer
        answer = self.llm.generate_with_context(
            query=question,
            context_chunks=context_chunks,
            chat_history=chat_history,
        )

        return {
            "answer": answer,
            "sources": sources,
        }

    def add_processed_document(
        self,
        pdf_name: str,
        pages: List[Dict],  # was List[Dict[str, any]] -- builtin `any` is not a type
        chunk_size: int = 600,
        chunk_overlap: int = 100,
    ):
        """
        Add a processed PDF to the vector store.

        Args:
            pdf_name: Name of the PDF file
            pages: List of page dicts with page_number and MD_text
            chunk_size: Size of text chunks in characters
            chunk_overlap: Overlap between chunks in characters
        """
        logger.info(f"Adding document to vector store: {pdf_name}")

        texts = []
        metadatas = []
        ids = []

        # Chunk every page and record provenance metadata per chunk
        for page in pages:
            page_num = page["page_number"]
            text = page["MD_text"]

            chunks = self._chunk_text(text, chunk_size, chunk_overlap)

            for chunk_idx, chunk in enumerate(chunks):
                texts.append(chunk)
                metadatas.append({
                    "pdf_name": pdf_name,
                    "page_number": page_num,
                    "chunk_index": chunk_idx,
                })
                # Deterministic, unique ID: file + page + chunk position
                ids.append(f"{pdf_name}_p{page_num}_c{chunk_idx}")

        self.vector_store.add_documents(texts, metadatas, ids)
        logger.info(f"Added {len(texts)} chunks from {pdf_name}")

    def _chunk_text(
        self, text: str, chunk_size: int, chunk_overlap: int
    ) -> List[str]:
        """
        Split text into overlapping fixed-size character chunks.

        Args:
            text: Text to chunk
            chunk_size: Size of each chunk
            chunk_overlap: Overlap between consecutive chunks

        Returns:
            List of non-blank text chunks
        """
        if not text:
            return []

        # Guard against a non-positive step (chunk_overlap >= chunk_size),
        # which would otherwise make this loop run forever.
        step = max(1, chunk_size - chunk_overlap)

        chunks = []
        start = 0

        while start < len(text):
            chunk = text[start:start + chunk_size]

            # Skip whitespace-only chunks; they add noise to the index
            if chunk.strip():
                chunks.append(chunk)

            start += step

        return chunks
|
| 143 |
+
|
| 144 |
+
|
| 145 |
+
# Lazily-created module-wide singleton.
_rag_pipeline = None


def get_rag_pipeline() -> RAGPipeline:
    """Return the shared RAGPipeline, constructing it on first use."""
    global _rag_pipeline
    if _rag_pipeline is not None:
        return _rag_pipeline
    _rag_pipeline = RAGPipeline()
    return _rag_pipeline
|
|
File without changes
|
|
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Azure Document Intelligence OCR processor"""
|
| 2 |
+
|
| 3 |
+
from typing import List, Dict
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
import io
|
| 6 |
+
from loguru import logger
|
| 7 |
+
|
| 8 |
+
from azure.ai.formrecognizer import DocumentAnalysisClient
|
| 9 |
+
from azure.core.credentials import AzureKeyCredential
|
| 10 |
+
|
| 11 |
+
from src.config import settings
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class AzureOCRProcessor:
    """Process PDFs using Azure Document Intelligence (Form Recognizer)."""

    def __init__(self):
        """Initialize the Azure Document Analysis client.

        Prefers the dedicated Document Intelligence endpoint/key from
        settings when configured (they were previously declared but never
        used); falls back to the Azure OpenAI credentials otherwise, which
        preserves the original behaviour for existing deployments.
        """
        endpoint = (
            settings.azure_document_intelligence_endpoint
            or settings.azure_openai_endpoint
        ).rstrip("/")
        api_key = (
            settings.azure_document_intelligence_key
            or settings.azure_openai_api_key
        )

        self.client = DocumentAnalysisClient(
            endpoint=endpoint, credential=AzureKeyCredential(api_key)
        )
        logger.info("Initialized Azure Document Analysis client")

    def process_pdf(self, pdf_file: bytes) -> List[Dict]:
        """
        Process a PDF and extract text using Azure Document Intelligence.

        Args:
            pdf_file: PDF file as bytes

        Returns:
            List of dicts with page_number (1-based) and MD_text

        Raises:
            Exception: Re-raises any Azure SDK error after logging it.
        """
        try:
            logger.info(f"Processing PDF ({len(pdf_file)} bytes)")

            # "prebuilt-read" is the general OCR model (printed text and
            # handwriting); the poller blocks until analysis completes.
            poller = self.client.begin_analyze_document(
                "prebuilt-read", document=io.BytesIO(pdf_file)
            )
            result = poller.result()

            # Flatten each page's recognized lines into one text blob
            pages_data = []
            for page_num, page in enumerate(result.pages, start=1):
                lines = []
                # Pages with no recognized text may lack `lines` entirely
                if hasattr(page, 'lines') and page.lines:
                    for line in page.lines:
                        lines.append(line.content)

                page_text = "\n".join(lines) if lines else ""

                pages_data.append({
                    "page_number": page_num,
                    "MD_text": page_text
                })

            logger.info(f"Successfully processed {len(pages_data)} pages")
            return pages_data

        except Exception as e:
            logger.error(f"Error processing PDF with Azure: {e}")
            raise
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
# Lazily-created module-wide singleton.
_azure_ocr_processor = None


def get_azure_ocr_processor() -> AzureOCRProcessor:
    """Return the shared AzureOCRProcessor, constructing it on first use."""
    global _azure_ocr_processor
    if _azure_ocr_processor is not None:
        return _azure_ocr_processor
    _azure_ocr_processor = AzureOCRProcessor()
    return _azure_ocr_processor
|
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Main OCR processor that handles different backends"""
|
| 2 |
+
|
| 3 |
+
from typing import List, Dict
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
from loguru import logger
|
| 6 |
+
|
| 7 |
+
from src.config import settings
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class OCRProcessor:
    """Facade over the concrete OCR backends (currently Azure only)."""

    def __init__(self, backend: str = None):
        """
        Initialize the OCR processor.

        Args:
            backend: OCR backend to use (azure, paddle, easy, tesseract).
                Falls back to settings.ocr_backend when None.

        Raises:
            ValueError: If the requested backend is not implemented.
        """
        self.backend = backend or settings.ocr_backend
        logger.info(f"Initializing OCR processor with backend: {self.backend}")

        # Guard clause: only the Azure backend is implemented today.
        if self.backend != "azure":
            raise ValueError(f"Unsupported OCR backend: {self.backend}")

        # Import lazily so the Azure SDK is only loaded when actually used.
        from src.ocr.azure_ocr import get_azure_ocr_processor
        self.processor = get_azure_ocr_processor()

    def process_pdf(self, pdf_file: bytes, filename: str = None) -> List[Dict[str, any]]:
        """
        Extract text from a PDF via the configured backend.

        Args:
            pdf_file: PDF file as bytes
            filename: Optional filename, used only for log messages

        Returns:
            List of dicts with page_number and MD_text

        Raises:
            Exception: Re-raises any backend error after logging it.
        """
        logger.info(f"Processing PDF: {filename or 'unnamed'} ({len(pdf_file)} bytes)")

        try:
            pages = self.processor.process_pdf(pdf_file)
            logger.info(f"Successfully processed {len(pages)} pages")
            return pages
        except Exception as e:
            logger.error(f"Error processing PDF: {e}")
            raise
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
# Lazily-created module-wide singleton.
_ocr_processor = None


def get_ocr_processor() -> OCRProcessor:
    """Return the shared OCRProcessor, constructing it on first use."""
    global _ocr_processor
    if _ocr_processor is not None:
        return _ocr_processor
    _ocr_processor = OCRProcessor()
    return _ocr_processor
|
|
File without changes
|
|
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""ChromaDB vector store for document embeddings"""
|
| 2 |
+
|
| 3 |
+
from typing import List, Dict, Optional
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
import chromadb
|
| 6 |
+
from chromadb.config import Settings
|
| 7 |
+
from sentence_transformers import SentenceTransformer
|
| 8 |
+
from loguru import logger
|
| 9 |
+
|
| 10 |
+
from src.config import settings as app_settings
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class ChromaVectorStore:
    """Persistent ChromaDB vector store with local sentence-transformer embeddings.

    Embeddings are computed in-process with all-MiniLM-L6-v2; ChromaDB only
    stores and searches the precomputed vectors.
    """

    def __init__(self, collection_name: str = "socar_documents"):
        """
        Initialize the ChromaDB vector store.

        Args:
            collection_name: Name of the collection to use (created if absent)
        """
        # Ensure the on-disk DB directory exists before opening the client
        self.db_path = app_settings.vector_db_path
        self.db_path.mkdir(parents=True, exist_ok=True)

        self.client = chromadb.PersistentClient(
            path=str(self.db_path),
            settings=Settings(
                anonymized_telemetry=False,
                allow_reset=True,
            ),
        )

        # Load the embedding model once per store; downloads on first run
        logger.info("Loading embedding model...")
        self.embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
        logger.info("Embedding model loaded")

        # Idempotent: reuses the collection if it already exists on disk
        self.collection = self.client.get_or_create_collection(
            name=collection_name,
            metadata={"description": "SOCAR historical documents"},
        )

        logger.info(f"ChromaDB initialized with collection: {collection_name}")
        logger.info(f"Collection contains {self.collection.count()} documents")

    def add_documents(
        self,
        texts: List[str],
        metadatas: List[Dict],
        ids: Optional[List[str]] = None,
    ):
        """
        Embed and add documents to the vector store.

        Args:
            texts: List of text chunks to add
            metadatas: List of metadata dicts (pdf_name, page_number, etc.),
                parallel to `texts`
            ids: Optional list of document IDs; auto-generated if omitted.
                NOTE(review): the auto-generated "doc_i" IDs restart at 0 on
                every call, so repeated calls without explicit ids collide --
                callers here always pass explicit ids.
        """
        if not texts:
            logger.warning("No texts provided to add")
            return

        # Generate positional IDs if the caller did not supply any
        if ids is None:
            ids = [f"doc_{i}" for i in range(len(texts))]

        logger.info(f"Adding {len(texts)} documents to vector store")

        # Compute embeddings locally (batched by sentence-transformers)
        embeddings = self.embedding_model.encode(texts, show_progress_bar=True)

        self.collection.add(
            documents=texts,
            embeddings=embeddings.tolist(),
            metadatas=metadatas,
            ids=ids,
        )

        logger.info(f"Successfully added {len(texts)} documents")

    def search(
        self,
        query: str,
        n_results: int = 5,
        filter_metadata: Optional[Dict] = None,
    ) -> Dict:
        """
        Search for documents similar to the query.

        Args:
            query: Search query text
            n_results: Number of results to return
            filter_metadata: Optional ChromaDB `where` metadata filter

        Returns:
            Dict with parallel lists: documents, metadatas, and distances
            (results for the single query, unwrapped from Chroma's batch shape)
        """
        logger.info(f"Searching for: {query[:100]}...")

        # Embed the query with the same model used at indexing time
        query_embedding = self.embedding_model.encode([query])[0]

        results = self.collection.query(
            query_embeddings=[query_embedding.tolist()],
            n_results=n_results,
            where=filter_metadata,
        )

        logger.info(f"Found {len(results['documents'][0])} results")

        # Chroma returns one list per query; we only ever send one query
        return {
            "documents": results["documents"][0],
            "metadatas": results["metadatas"][0],
            "distances": results["distances"][0],
        }

    def clear(self):
        """Delete and recreate the collection, dropping all documents."""
        logger.warning("Clearing all documents from collection")
        # NOTE(review): `.name` is read from the already-deleted collection
        # handle; this works with current chromadb clients, but caching the
        # collection name on self would be more robust -- confirm.
        self.client.delete_collection(self.collection.name)
        self.collection = self.client.create_collection(
            name=self.collection.name,
            metadata={"description": "SOCAR historical documents"},
        )

    def get_stats(self) -> Dict:
        """Return collection statistics (document count, name, DB path)."""
        return {
            "total_documents": self.collection.count(),
            "collection_name": self.collection.name,
            "db_path": str(self.db_path),
        }
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
# Lazily-created module-wide singleton.
_vector_store = None


def get_vector_store() -> ChromaVectorStore:
    """Return the shared ChromaVectorStore, constructing it on first use."""
    global _vector_store
    if _vector_store is not None:
        return _vector_store
    _vector_store = ChromaVectorStore()
    return _vector_store
|
|
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash

# SOCAR Document Processing - Quick Start Script
# Builds the Docker image, starts the stack, and waits for the API
# health endpoint before printing usage hints.

set -e

echo "=================================="
echo "SOCAR Document Processing System"
echo "=================================="
echo ""

# Check if .env exists
if [ ! -f .env ]; then
    echo "β Error: .env file not found"
    echo "Please create .env file with required credentials"
    exit 1
fi

# Check if Docker is installed
if ! command -v docker &> /dev/null; then
    echo "β Error: Docker is not installed"
    echo "Please install Docker: https://docs.docker.com/get-docker/"
    exit 1
fi

# Check if Docker Compose is installed
if ! command -v docker-compose &> /dev/null; then
    echo "β Error: Docker Compose is not installed"
    echo "Please install Docker Compose: https://docs.docker.com/compose/install/"
    exit 1
fi

echo "β Prerequisites checked"
echo ""

# Create data directories
mkdir -p data/pdfs data/vector_db data/processed
echo "β Data directories created"
echo ""

# Build and start containers
echo "π¨ Building Docker image..."
docker-compose build

echo ""
echo "π Starting containers..."
docker-compose up -d

echo ""
echo "β³ Waiting for service to be ready..."
sleep 5

# Wait for health check
MAX_RETRIES=30
RETRY_COUNT=0
until curl -f http://localhost:8000/ &> /dev/null || [ $RETRY_COUNT -eq $MAX_RETRIES ]; do
    echo " Waiting for API... ($RETRY_COUNT/$MAX_RETRIES)"
    sleep 2
    # BUGFIX: ((RETRY_COUNT++)) returns exit status 1 when the value
    # before incrementing is 0, which aborts the whole script under
    # `set -e` on the very first retry. Plain assignment is always safe.
    RETRY_COUNT=$((RETRY_COUNT + 1))
done

if [ $RETRY_COUNT -eq $MAX_RETRIES ]; then
    echo ""
    echo "β Failed to start service"
    echo "Check logs with: docker-compose logs"
    exit 1
fi

echo ""
echo "=================================="
echo "β SOCAR API is ready!"
echo "=================================="
echo ""
echo "π API URL: http://localhost:8000"
echo "π Documentation: http://localhost:8000/docs"
echo ""
echo "Useful commands:"
echo " β’ View logs: docker-compose logs -f"
echo " β’ Stop: docker-compose down"
echo " β’ Restart: docker-compose restart"
echo ""
|
|
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Complete system test"""
|
| 2 |
+
|
| 3 |
+
import requests
|
| 4 |
+
import json
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
|
| 7 |
+
API_URL = "http://localhost:8000"
|
| 8 |
+
|
| 9 |
+
def test_health():
    """Check the API root endpoint responds with HTTP 200."""
    banner = "=" * 60
    print(banner)
    print("1. Testing API Health")
    print(banner)

    response = requests.get(f"{API_URL}/")
    print(f"Status: {response.status_code}")
    print(json.dumps(response.json(), indent=2))

    return response.status_code == 200
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def test_ocr():
    """Upload a sample PDF to /ocr and verify pages come back.

    Requires data/pdfs/document_00.pdf to exist locally; returns False
    (skip-as-fail) when it does not.
    """
    print("\n" + "=" * 60)
    print("2. Testing OCR Endpoint")
    print("=" * 60)

    pdf_path = Path("data/pdfs/document_00.pdf")
    if not pdf_path.exists():
        print("β PDF not found")
        return False

    # Stream the PDF as a multipart upload
    with open(pdf_path, "rb") as f:
        files = {"file": (pdf_path.name, f, "application/pdf")}
        response = requests.post(f"{API_URL}/ocr", files=files)

    if response.status_code == 200:
        result = response.json()
        print(f"β Successfully processed {len(result)} pages")
        print(f" First page preview: {result[0]['MD_text'][:100]}...")
        return True
    else:
        print(f"β Error: {response.status_code}")
        return False
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def test_llm():
    """Send a single-turn question to /llm and verify answer + sources."""
    print("\n" + "=" * 60)
    print("3. Testing LLM Endpoint (RAG)")
    print("=" * 60)

    # The /llm endpoint takes the raw message list as the JSON body
    messages = [
        {"role": "user", "content": "What geological formations are discussed?"}
    ]

    response = requests.post(
        f"{API_URL}/llm",
        json=messages,
        headers={"Content-Type": "application/json"}
    )

    if response.status_code == 200:
        result = response.json()
        print(f"β Generated answer with {len(result['sources'])} sources")
        print(f"\nAnswer preview:")
        print(result["answer"][:300] + "...")
        print(f"\nSources:")
        # Show at most the first three source citations
        for i, src in enumerate(result["sources"][:3], 1):
            print(f" [{i}] {src['pdf_name']} - Page {src['page_number']}")
        return True
    else:
        print(f"β Error: {response.status_code}")
        return False
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
def test_llm_with_history():
    """Test the /llm endpoint with multi-turn chat history.

    Sends a three-message conversation (user / assistant / user) so the
    server must use the prior turns as context. Expects a 200 response
    whose JSON body contains ``answer`` and ``sources`` keys (assumed
    from the accesses below — confirm against the API schema).

    Returns:
        bool: True on a successful contextual answer, False otherwise.
    """
    print("\n" + "=" * 60)
    print("4. Testing LLM with Chat History")
    print("=" * 60)

    messages = [
        {"role": "user", "content": "What is the South Caspian Basin?"},
        {"role": "assistant", "content": "The South Caspian Basin is a sedimentary basin..."},
        {"role": "user", "content": "Tell me more about its hydrocarbon potential."}
    ]

    # timeout prevents an indefinite hang if the server never responds.
    response = requests.post(
        f"{API_URL}/llm",
        json=messages,
        headers={"Content-Type": "application/json"},
        timeout=120,
    )

    if response.status_code == 200:
        result = response.json()
        print(f"β Generated contextual answer with chat history")
        print(f" Answer length: {len(result['answer'])} characters")
        print(f" Sources: {len(result['sources'])} documents")
        return True
    else:
        print(f"β Error: {response.status_code}")
        return False


if __name__ == "__main__":
    # Local import: only needed when run as a script, keeps the module's
    # top-level imports untouched.
    import sys

    print("\n" + "π" * 30)
    print("SOCAR Document Processing System - Complete Test")
    print("π" * 30 + "\n")

    # Run every check in order; each test returns True/False and prints
    # its own diagnostics.
    results = []
    results.append(("Health Check", test_health()))
    results.append(("OCR Endpoint", test_ocr()))
    results.append(("LLM Endpoint", test_llm()))
    results.append(("LLM Chat History", test_llm_with_history()))

    print("\n" + "=" * 60)
    print("TEST SUMMARY")
    print("=" * 60)
    for name, passed in results:
        status = "β PASS" if passed else "β FAIL"
        print(f"{status:10} - {name}")

    all_passed = all(r[1] for r in results)
    print("\n" + ("π" if all_passed else "β") * 30)
    if all_passed:
        print("ALL TESTS PASSED - System Ready for Hackathon!")
    else:
        print("Some tests failed - please review")
    print(("π" if all_passed else "β") * 30 + "\n")

    # Previously the script always exited 0, so CI could not detect a
    # failed run. Propagate the outcome via the process exit code.
    sys.exit(0 if all_passed else 1)