Spaces:
Sleeping
Sleeping
Sahil Garg commited on
Commit Β·
67dfd9a
1
Parent(s): 6968c5c
modularization and configuration
Browse files- .env.example +6 -0
- .gitignore +1 -0
- README.md +0 -0
- api/routes.py +252 -0
- api/schemas.py +104 -0
- app.py +32 -115
- boq_processor.py +0 -477
- config/settings.py +109 -0
- core/embeddings.py +118 -0
- core/llm.py +78 -0
- core/pdf_extractor.py +101 -0
- core/rag_chain.py +80 -0
- prompts/get_prompts.py +17 -0
- prompts/templates.yaml +71 -0
- requirements.txt +51 -15
- services/boq_extractor.py +296 -0
- services/consistency.py +161 -0
- streamlit_app.py +286 -119
.env.example
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# BOQTenders Environment Configuration
|
| 2 |
+
# Copy this file to .env and fill in your API keys
|
| 3 |
+
|
| 4 |
+
# Required API Keys
|
| 5 |
+
GOOGLE_API_KEY=your_google_api_key_here
|
| 6 |
+
HF_API_TOKEN=your_huggingface_api_token_here
|
.gitignore
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
__pycache__/
|
|
|
|
| 2 |
*.pyc
|
| 3 |
*.pyo
|
| 4 |
*.pyd
|
|
|
|
| 1 |
__pycache__/
|
| 2 |
+
**/__pycache__/
|
| 3 |
*.pyc
|
| 4 |
*.pyo
|
| 5 |
*.pyd
|
README.md
CHANGED
|
Binary files a/README.md and b/README.md differ
|
|
|
api/routes.py
ADDED
|
@@ -0,0 +1,252 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
FastAPI routes for BOQ extraction API.
|
| 3 |
+
"""
|
| 4 |
+
import tempfile
|
| 5 |
+
from typing import Optional
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
|
| 8 |
+
from fastapi import FastAPI, File, UploadFile, HTTPException
|
| 9 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 10 |
+
from loguru import logger
|
| 11 |
+
|
| 12 |
+
from config.settings import settings
|
| 13 |
+
from core.pdf_extractor import PDFExtractor
|
| 14 |
+
from core.embeddings import EmbeddingService
|
| 15 |
+
from core.rag_chain import RAGChainBuilder
|
| 16 |
+
from services.boq_extractor import BOQExtractor
|
| 17 |
+
from services.consistency import ConsistencyChecker
|
| 18 |
+
from api.schemas import (
|
| 19 |
+
ChatRequest,
|
| 20 |
+
ChatResponse,
|
| 21 |
+
UploadResponse,
|
| 22 |
+
ConsistencyResponse,
|
| 23 |
+
ErrorResponse,
|
| 24 |
+
)
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
# Initialize FastAPI app
|
| 28 |
+
app = FastAPI(
|
| 29 |
+
title=settings.api.title,
|
| 30 |
+
description=settings.api.description,
|
| 31 |
+
version=settings.api.version,
|
| 32 |
+
docs_url="/docs" if settings.api.docs_enabled else None,
|
| 33 |
+
redoc_url="/redoc" if settings.api.docs_enabled else None,
|
| 34 |
+
)
|
| 35 |
+
|
| 36 |
+
# Configure CORS
|
| 37 |
+
app.add_middleware(
|
| 38 |
+
CORSMiddleware,
|
| 39 |
+
allow_origins=settings.api.cors_origins,
|
| 40 |
+
allow_credentials=True,
|
| 41 |
+
allow_methods=["*"],
|
| 42 |
+
allow_headers=["*"],
|
| 43 |
+
)
|
| 44 |
+
|
| 45 |
+
# Global state for session data
|
| 46 |
+
_session_state = {
|
| 47 |
+
"qa_chain": None,
|
| 48 |
+
"vector_store": None,
|
| 49 |
+
"chunks": None,
|
| 50 |
+
}
|
| 51 |
+
|
| 52 |
+
# Initialize services
|
| 53 |
+
pdf_extractor = PDFExtractor()
|
| 54 |
+
embedding_service = EmbeddingService()
|
| 55 |
+
rag_builder = RAGChainBuilder()
|
| 56 |
+
boq_extractor = BOQExtractor()
|
| 57 |
+
consistency_checker = ConsistencyChecker(boq_extractor=boq_extractor)
|
| 58 |
+
|
| 59 |
+
# Expose router for external use
|
| 60 |
+
router = app.router
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
@app.post(
|
| 64 |
+
"/upload",
|
| 65 |
+
response_model=UploadResponse,
|
| 66 |
+
responses={400: {"model": ErrorResponse}, 500: {"model": ErrorResponse}},
|
| 67 |
+
tags=["Documents"]
|
| 68 |
+
)
|
| 69 |
+
async def upload_pdf(file: UploadFile = File(...)):
|
| 70 |
+
"""
|
| 71 |
+
Upload a PDF file for BOQ extraction.
|
| 72 |
+
|
| 73 |
+
- Extracts text from PDF
|
| 74 |
+
- Creates embeddings and vector store
|
| 75 |
+
- Extracts BOQ items
|
| 76 |
+
- Sets up QA chain for chat
|
| 77 |
+
"""
|
| 78 |
+
global _session_state
|
| 79 |
+
|
| 80 |
+
if not file:
|
| 81 |
+
raise HTTPException(status_code=400, detail="No file uploaded")
|
| 82 |
+
|
| 83 |
+
if not file.filename.lower().endswith('.pdf'):
|
| 84 |
+
raise HTTPException(status_code=400, detail="Only PDF files are supported")
|
| 85 |
+
|
| 86 |
+
try:
|
| 87 |
+
logger.info(f'Processing uploaded file: {file.filename}')
|
| 88 |
+
|
| 89 |
+
# Save to temp file
|
| 90 |
+
with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
|
| 91 |
+
content = await file.read()
|
| 92 |
+
temp_file.write(content)
|
| 93 |
+
temp_path = temp_file.name
|
| 94 |
+
|
| 95 |
+
try:
|
| 96 |
+
# Extract text from PDF
|
| 97 |
+
logger.info('Extracting text from PDF...')
|
| 98 |
+
text = pdf_extractor.extract_text(temp_path, filename=file.filename)
|
| 99 |
+
|
| 100 |
+
if not text:
|
| 101 |
+
raise HTTPException(
|
| 102 |
+
status_code=400,
|
| 103 |
+
detail="Could not extract text from PDF"
|
| 104 |
+
)
|
| 105 |
+
|
| 106 |
+
# Create chunks and vector store
|
| 107 |
+
logger.info('Creating embeddings...')
|
| 108 |
+
chunks = embedding_service.split_text(text)
|
| 109 |
+
vector_store = embedding_service.create_vector_store(chunks)
|
| 110 |
+
|
| 111 |
+
# Extract BOQ
|
| 112 |
+
logger.info('Extracting BOQ...')
|
| 113 |
+
boq_output = boq_extractor.extract(chunks, vector_store)
|
| 114 |
+
|
| 115 |
+
# Build QA chain
|
| 116 |
+
logger.info('Building QA chain...')
|
| 117 |
+
qa_chain = rag_builder.build(vector_store)
|
| 118 |
+
|
| 119 |
+
# Store in session state
|
| 120 |
+
_session_state["qa_chain"] = qa_chain
|
| 121 |
+
_session_state["vector_store"] = vector_store
|
| 122 |
+
_session_state["chunks"] = chunks
|
| 123 |
+
|
| 124 |
+
logger.info(f'Upload completed: {len(chunks)} chunks created')
|
| 125 |
+
|
| 126 |
+
return UploadResponse(
|
| 127 |
+
message="success",
|
| 128 |
+
output=boq_output
|
| 129 |
+
)
|
| 130 |
+
|
| 131 |
+
finally:
|
| 132 |
+
# Clean up temp file
|
| 133 |
+
Path(temp_path).unlink(missing_ok=True)
|
| 134 |
+
|
| 135 |
+
except HTTPException:
|
| 136 |
+
raise
|
| 137 |
+
except Exception as e:
|
| 138 |
+
logger.error(f'Error processing upload: {e}')
|
| 139 |
+
raise HTTPException(status_code=500, detail=str(e))
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
@app.post(
|
| 143 |
+
"/chat",
|
| 144 |
+
response_model=ChatResponse,
|
| 145 |
+
responses={400: {"model": ErrorResponse}, 500: {"model": ErrorResponse}},
|
| 146 |
+
tags=["Chat"]
|
| 147 |
+
)
|
| 148 |
+
async def chat(request: ChatRequest):
|
| 149 |
+
"""
|
| 150 |
+
Ask a question about the uploaded document.
|
| 151 |
+
|
| 152 |
+
Requires a document to be uploaded first via /upload endpoint.
|
| 153 |
+
"""
|
| 154 |
+
global _session_state
|
| 155 |
+
|
| 156 |
+
if not _session_state.get("qa_chain"):
|
| 157 |
+
raise HTTPException(
|
| 158 |
+
status_code=400,
|
| 159 |
+
detail="No document loaded. Please upload a PDF first."
|
| 160 |
+
)
|
| 161 |
+
|
| 162 |
+
try:
|
| 163 |
+
logger.info(f'Processing chat question: {request.question[:50]}...')
|
| 164 |
+
|
| 165 |
+
qa_chain = _session_state["qa_chain"]
|
| 166 |
+
|
| 167 |
+
# Get response from QA chain (using old LangChain API)
|
| 168 |
+
response = qa_chain({"question": request.question})
|
| 169 |
+
|
| 170 |
+
answer = response.get("answer", "")
|
| 171 |
+
|
| 172 |
+
logger.info('Chat response generated')
|
| 173 |
+
|
| 174 |
+
return ChatResponse(answer=answer)
|
| 175 |
+
|
| 176 |
+
except Exception as e:
|
| 177 |
+
logger.error(f'Error processing chat: {e}')
|
| 178 |
+
raise HTTPException(status_code=500, detail=str(e))
|
| 179 |
+
|
| 180 |
+
|
| 181 |
+
@app.post(
|
| 182 |
+
"/consistency",
|
| 183 |
+
response_model=ConsistencyResponse,
|
| 184 |
+
responses={400: {"model": ErrorResponse}, 500: {"model": ErrorResponse}},
|
| 185 |
+
tags=["Analysis"]
|
| 186 |
+
)
|
| 187 |
+
async def check_consistency(runs: int = 4):
|
| 188 |
+
"""
|
| 189 |
+
Check extraction consistency by running multiple extractions.
|
| 190 |
+
|
| 191 |
+
Requires a document to be uploaded first via /upload endpoint.
|
| 192 |
+
|
| 193 |
+
Args:
|
| 194 |
+
runs: Number of extraction runs (default: 4)
|
| 195 |
+
"""
|
| 196 |
+
global _session_state
|
| 197 |
+
|
| 198 |
+
if not _session_state.get("chunks"):
|
| 199 |
+
raise HTTPException(
|
| 200 |
+
status_code=400,
|
| 201 |
+
detail="No document loaded. Please upload a PDF first."
|
| 202 |
+
)
|
| 203 |
+
|
| 204 |
+
if runs < 2:
|
| 205 |
+
raise HTTPException(
|
| 206 |
+
status_code=400,
|
| 207 |
+
detail="At least 2 runs required for consistency check"
|
| 208 |
+
)
|
| 209 |
+
|
| 210 |
+
if runs > 10:
|
| 211 |
+
raise HTTPException(
|
| 212 |
+
status_code=400,
|
| 213 |
+
detail="Maximum 10 runs allowed"
|
| 214 |
+
)
|
| 215 |
+
|
| 216 |
+
try:
|
| 217 |
+
logger.info(f'Running consistency check with {runs} runs')
|
| 218 |
+
|
| 219 |
+
result = consistency_checker.check(
|
| 220 |
+
chunks=_session_state["chunks"],
|
| 221 |
+
vector_store=_session_state["vector_store"],
|
| 222 |
+
runs=runs
|
| 223 |
+
)
|
| 224 |
+
|
| 225 |
+
return ConsistencyResponse(
|
| 226 |
+
consistency_score=result.get("consistency_score"),
|
| 227 |
+
successful_runs=result.get("successful_runs"),
|
| 228 |
+
avg_confidence=result.get("avg_confidence"),
|
| 229 |
+
is_low_consistency=result.get("is_low_consistency")
|
| 230 |
+
)
|
| 231 |
+
|
| 232 |
+
except Exception as e:
|
| 233 |
+
logger.error(f'Error in consistency check: {e}')
|
| 234 |
+
raise HTTPException(status_code=500, detail=str(e))
|
| 235 |
+
|
| 236 |
+
|
| 237 |
+
@app.get(
|
| 238 |
+
"/clear",
|
| 239 |
+
tags=["Session"]
|
| 240 |
+
)
|
| 241 |
+
async def clear_session():
|
| 242 |
+
"""Clear the current session state."""
|
| 243 |
+
global _session_state
|
| 244 |
+
|
| 245 |
+
_session_state = {
|
| 246 |
+
"qa_chain": None,
|
| 247 |
+
"vector_store": None,
|
| 248 |
+
"chunks": None,
|
| 249 |
+
}
|
| 250 |
+
|
| 251 |
+
logger.info('Session cleared')
|
| 252 |
+
return {"message": "Session cleared"}
|
api/schemas.py
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Pydantic schemas for API request/response validation.
|
| 3 |
+
"""
|
| 4 |
+
from typing import Optional, List, Dict, Any
|
| 5 |
+
from pydantic import BaseModel, Field
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class ChatRequest(BaseModel):
|
| 9 |
+
"""Chat endpoint request schema."""
|
| 10 |
+
question: str = Field(..., description="Question to ask about the document")
|
| 11 |
+
|
| 12 |
+
class Config:
|
| 13 |
+
json_schema_extra = {
|
| 14 |
+
"example": {
|
| 15 |
+
"question": "What is the total quantity of steel required?"
|
| 16 |
+
}
|
| 17 |
+
}
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class ChatResponse(BaseModel):
|
| 21 |
+
"""Chat endpoint response schema."""
|
| 22 |
+
answer: str = Field(..., description="Answer to the question")
|
| 23 |
+
|
| 24 |
+
class Config:
|
| 25 |
+
json_schema_extra = {
|
| 26 |
+
"example": {
|
| 27 |
+
"answer": "The total quantity of steel required is 500 MT."
|
| 28 |
+
}
|
| 29 |
+
}
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
class UploadResponse(BaseModel):
|
| 33 |
+
"""Upload endpoint response schema."""
|
| 34 |
+
message: str = Field(..., description="Status message")
|
| 35 |
+
output: str = Field(..., description="Extracted BOQ in markdown format")
|
| 36 |
+
|
| 37 |
+
class Config:
|
| 38 |
+
json_schema_extra = {
|
| 39 |
+
"example": {
|
| 40 |
+
"message": "success",
|
| 41 |
+
"output": "## DOCUMENT SUMMARY\n..."
|
| 42 |
+
}
|
| 43 |
+
}
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
class ConsistencyResponse(BaseModel):
|
| 47 |
+
"""Consistency check endpoint response schema."""
|
| 48 |
+
consistency_score: float = Field(
|
| 49 |
+
...,
|
| 50 |
+
ge=0,
|
| 51 |
+
le=100,
|
| 52 |
+
description="Consistency score as percentage (0-100)"
|
| 53 |
+
)
|
| 54 |
+
successful_runs: int = Field(..., description="Number of successful runs")
|
| 55 |
+
avg_confidence: float = Field(
|
| 56 |
+
...,
|
| 57 |
+
ge=0,
|
| 58 |
+
le=100,
|
| 59 |
+
description="Average confidence score"
|
| 60 |
+
)
|
| 61 |
+
is_low_consistency: bool = Field(
|
| 62 |
+
...,
|
| 63 |
+
description="Whether consistency is below threshold"
|
| 64 |
+
)
|
| 65 |
+
|
| 66 |
+
class Config:
|
| 67 |
+
json_schema_extra = {
|
| 68 |
+
"example": {
|
| 69 |
+
"consistency_score": 92.5,
|
| 70 |
+
"successful_runs": 4,
|
| 71 |
+
"avg_confidence": 85.2,
|
| 72 |
+
"is_low_consistency": False
|
| 73 |
+
}
|
| 74 |
+
}
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
class BOQResponse(BaseModel):
|
| 78 |
+
"""BOQ extraction response schema."""
|
| 79 |
+
boq_output: str = Field(..., description="Extracted BOQ in markdown format")
|
| 80 |
+
items_count: int = Field(default=0, description="Number of BOQ items extracted")
|
| 81 |
+
|
| 82 |
+
class Config:
|
| 83 |
+
json_schema_extra = {
|
| 84 |
+
"example": {
|
| 85 |
+
"boq_output": "## DOCUMENT SUMMARY\n...",
|
| 86 |
+
"items_count": 25
|
| 87 |
+
}
|
| 88 |
+
}
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
class ErrorResponse(BaseModel):
|
| 92 |
+
"""Error response schema."""
|
| 93 |
+
error: str = Field(..., description="Error type")
|
| 94 |
+
message: str = Field(..., description="Error message")
|
| 95 |
+
detail: Optional[str] = Field(default=None, description="Additional error details")
|
| 96 |
+
|
| 97 |
+
class Config:
|
| 98 |
+
json_schema_extra = {
|
| 99 |
+
"example": {
|
| 100 |
+
"error": "ValidationError",
|
| 101 |
+
"message": "No file uploaded",
|
| 102 |
+
"detail": "Please upload a PDF file"
|
| 103 |
+
}
|
| 104 |
+
}
|
app.py
CHANGED
|
@@ -1,122 +1,39 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
import boq_processor
|
| 4 |
-
from loguru import logger
|
| 5 |
-
import os
|
| 6 |
-
import shutil
|
| 7 |
-
from typing import Dict, Any
|
| 8 |
|
| 9 |
-
|
| 10 |
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
|
|
|
| 15 |
|
| 16 |
-
|
| 17 |
-
|
| 18 |
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
global chunks
|
| 34 |
-
chunks = boq_processor.load_and_process_pdf(pdf_path, filename=file.filename)
|
| 35 |
-
vector_store = boq_processor.create_vector_store(chunks)
|
| 36 |
-
qa_chain = boq_processor.setup_rag_chain(vector_store)
|
| 37 |
-
|
| 38 |
-
# Use comprehensive extraction for complete BOQ coverage
|
| 39 |
-
extracted_boq = boq_processor.extract_boq_comprehensive(chunks, vector_store)
|
| 40 |
-
|
| 41 |
-
logger.info("PDF uploaded and processed successfully")
|
| 42 |
-
|
| 43 |
-
# Return enriched response with processing metadata
|
| 44 |
-
return {
|
| 45 |
-
"status": "success",
|
| 46 |
-
"message": "PDF uploaded and processed successfully",
|
| 47 |
-
"file_name": file.filename,
|
| 48 |
-
"processing_info": {
|
| 49 |
-
"documents_loaded": len(chunks),
|
| 50 |
-
"chunks_created": len(chunks),
|
| 51 |
-
"vector_store_ready": vector_store is not None,
|
| 52 |
-
"rag_chain_ready": qa_chain is not None
|
| 53 |
-
},
|
| 54 |
-
"extracted_boq": extracted_boq
|
| 55 |
-
}
|
| 56 |
-
except Exception as e:
|
| 57 |
-
error_msg = str(e)
|
| 58 |
-
logger.error(f"Error processing PDF: {error_msg}")
|
| 59 |
-
|
| 60 |
-
# Check if it's a rate limit error and provide specific guidance
|
| 61 |
-
if "429" in error_msg or "RESOURCE_EXHAUSTED" in error_msg or "quota" in error_msg.lower():
|
| 62 |
-
raise HTTPException(
|
| 63 |
-
status_code=429,
|
| 64 |
-
detail="API rate limit exceeded. Please try again later or upgrade your API plan. See https://ai.google.dev/gemini-api/docs/rate-limits"
|
| 65 |
-
)
|
| 66 |
-
else:
|
| 67 |
-
raise HTTPException(status_code=500, detail="Internal server error")
|
| 68 |
-
finally:
|
| 69 |
-
if os.path.exists(pdf_path):
|
| 70 |
-
os.remove(pdf_path)
|
| 71 |
|
| 72 |
-
@app.post("/chat", summary="Chat with BOQ", description="Send a question about the uploaded BOQ PDF and get an answer.")
|
| 73 |
-
async def chat(request: ChatRequest) -> Dict[str, str]:
|
| 74 |
-
if not qa_chain:
|
| 75 |
-
logger.warning("Chat attempted without uploaded PDF")
|
| 76 |
-
raise HTTPException(status_code=400, detail="No PDF uploaded. Please upload a PDF first using /upload")
|
| 77 |
-
|
| 78 |
-
try:
|
| 79 |
-
logger.info(f"Processing chat question: {request.question}")
|
| 80 |
-
|
| 81 |
-
# Use old LangChain API (0.1.x) directly
|
| 82 |
-
result = qa_chain({"question": request.question})
|
| 83 |
-
|
| 84 |
-
logger.info("Chat response generated")
|
| 85 |
-
return {"answer": result["answer"]}
|
| 86 |
-
except Exception as e:
|
| 87 |
-
error_msg = str(e)
|
| 88 |
-
logger.error(f"Error in chat: {error_msg}")
|
| 89 |
-
|
| 90 |
-
# Check if it's a rate limit error and provide specific guidance
|
| 91 |
-
if "429" in error_msg or "RESOURCE_EXHAUSTED" in error_msg or "quota" in error_msg.lower():
|
| 92 |
-
raise HTTPException(
|
| 93 |
-
status_code=429,
|
| 94 |
-
detail="API rate limit exceeded. Please try again later or upgrade your API plan. See https://ai.google.dev/gemini-api/docs/rate-limits"
|
| 95 |
-
)
|
| 96 |
-
else:
|
| 97 |
-
raise HTTPException(status_code=500, detail="Internal server error")
|
| 98 |
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
global chunks, vector_store
|
| 102 |
-
if not chunks or not vector_store:
|
| 103 |
-
logger.warning("Consistency check attempted without uploaded PDF")
|
| 104 |
-
raise HTTPException(status_code=400, detail="No PDF uploaded. Please upload a PDF first using /upload")
|
| 105 |
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
error_msg = str(e)
|
| 113 |
-
logger.error(f"Error in consistency check: {error_msg}")
|
| 114 |
-
|
| 115 |
-
# Check if it's a rate limit error
|
| 116 |
-
if "429" in error_msg or "RESOURCE_EXHAUSTED" in error_msg or "quota" in error_msg.lower():
|
| 117 |
-
raise HTTPException(
|
| 118 |
-
status_code=429,
|
| 119 |
-
detail="API rate limit exceeded. Please try again later or upgrade your API plan. See https://ai.google.dev/gemini-api/docs/rate-limits"
|
| 120 |
-
)
|
| 121 |
-
else:
|
| 122 |
-
raise HTTPException(status_code=500, detail="Internal server error")
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
BOQTenders API Server
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
|
| 4 |
+
FastAPI application entry point for BOQ extraction and chat services.
|
| 5 |
|
| 6 |
+
Usage:
|
| 7 |
+
uvicorn app:app --host 0.0.0.0 --port 8000 --reload
|
| 8 |
+
"""
|
| 9 |
+
import sys
|
| 10 |
+
from pathlib import Path
|
| 11 |
|
| 12 |
+
# Add project root to path
|
| 13 |
+
sys.path.insert(0, str(Path(__file__).parent))
|
| 14 |
|
| 15 |
+
from loguru import logger
|
| 16 |
+
from config.settings import settings
|
| 17 |
+
from api.routes import app
|
| 18 |
+
|
| 19 |
+
# Configure logging
|
| 20 |
+
logger.remove()
|
| 21 |
+
logger.add(
|
| 22 |
+
sys.stderr,
|
| 23 |
+
level=settings.log_level,
|
| 24 |
+
format="<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level: <8}</level> | <cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> - <level>{message}</level>"
|
| 25 |
+
)
|
| 26 |
+
|
| 27 |
+
# Export app for uvicorn
|
| 28 |
+
__all__ = ["app"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
|
| 31 |
+
if __name__ == "__main__":
|
| 32 |
+
import uvicorn
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
|
| 34 |
+
uvicorn.run(
|
| 35 |
+
"app:app",
|
| 36 |
+
host=settings.api.host,
|
| 37 |
+
port=settings.api.port,
|
| 38 |
+
reload=settings.api.debug,
|
| 39 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
boq_processor.py
DELETED
|
@@ -1,477 +0,0 @@
|
|
| 1 |
-
import os
|
| 2 |
-
from typing import List, Optional
|
| 3 |
-
import dotenv
|
| 4 |
-
from pydantic_settings import BaseSettings
|
| 5 |
-
from loguru import logger
|
| 6 |
-
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
| 7 |
-
from langchain_huggingface import HuggingFaceEmbeddings
|
| 8 |
-
from langchain_community.vectorstores import FAISS
|
| 9 |
-
from langchain_google_genai import GoogleGenerativeAI
|
| 10 |
-
import time
|
| 11 |
-
from functools import wraps
|
| 12 |
-
import re
|
| 13 |
-
import requests
|
| 14 |
-
|
| 15 |
-
from langchain_core.documents import Document
|
| 16 |
-
|
| 17 |
-
# Load environment variables
|
| 18 |
-
dotenv.load_dotenv()
|
| 19 |
-
|
| 20 |
-
def retry_with_exponential_backoff(max_retries: int = 3, initial_delay: int = 2, backoff_factor: int = 2):
|
| 21 |
-
"""Decorator for retry logic with exponential backoff for API rate limits."""
|
| 22 |
-
def decorator(func):
|
| 23 |
-
@wraps(func)
|
| 24 |
-
def wrapper(*args, **kwargs):
|
| 25 |
-
last_exception = None
|
| 26 |
-
for attempt in range(max_retries + 1):
|
| 27 |
-
try:
|
| 28 |
-
return func(*args, **kwargs)
|
| 29 |
-
except Exception as e:
|
| 30 |
-
last_exception = e
|
| 31 |
-
error_str = str(e)
|
| 32 |
-
is_rate_limit = ("429" in error_str or "RESOURCE_EXHAUSTED" in error_str or "quota" in error_str.lower())
|
| 33 |
-
if is_rate_limit and attempt < max_retries:
|
| 34 |
-
delay = initial_delay * (backoff_factor ** attempt)
|
| 35 |
-
logger.warning(f"Rate limit encountered. Retry {attempt + 1}/{max_retries} in {delay}s: {error_str}")
|
| 36 |
-
time.sleep(delay)
|
| 37 |
-
else:
|
| 38 |
-
if is_rate_limit and attempt == max_retries:
|
| 39 |
-
logger.error(f"Rate limit exhausted after {max_retries} retries")
|
| 40 |
-
raise
|
| 41 |
-
raise last_exception
|
| 42 |
-
return wrapper
|
| 43 |
-
return decorator
|
| 44 |
-
|
| 45 |
-
class Settings(BaseSettings):
|
| 46 |
-
google_api_key: str = os.getenv("GOOGLE_API_KEY")
|
| 47 |
-
hf_api_token: str = os.getenv("HF_API_TOKEN")
|
| 48 |
-
model_name: str = "gemini-2.5-flash-lite"
|
| 49 |
-
temperature: float = 0.0
|
| 50 |
-
chunk_size: int = 1000
|
| 51 |
-
chunk_overlap: int = 500
|
| 52 |
-
embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2"
|
| 53 |
-
|
| 54 |
-
settings = Settings()
|
| 55 |
-
|
| 56 |
-
class BOQProcessor:
|
| 57 |
-
def __init__(self):
|
| 58 |
-
self.settings = settings
|
| 59 |
-
self.llm = GoogleGenerativeAI(model=self.settings.model_name, temperature=self.settings.temperature)
|
| 60 |
-
self.text_splitter = RecursiveCharacterTextSplitter(
|
| 61 |
-
chunk_size=self.settings.chunk_size,
|
| 62 |
-
chunk_overlap=self.settings.chunk_overlap
|
| 63 |
-
)
|
| 64 |
-
|
| 65 |
-
def _table_to_markdown(self, table: dict) -> str:
|
| 66 |
-
# Commented out: Table extraction not needed for now
|
| 67 |
-
"""
|
| 68 |
-
headers = table.get('headers', [])
|
| 69 |
-
rows = table.get('rows', [])
|
| 70 |
-
if not headers:
|
| 71 |
-
return ''
|
| 72 |
-
md = '| ' + ' | '.join(headers) + ' |\n'
|
| 73 |
-
md += '|' + '|'.join(['---'] * len(headers)) + '|\n'
|
| 74 |
-
for row in rows:
|
| 75 |
-
md += '| ' + ' | '.join(str(cell) for cell in row) + ' |\n'
|
| 76 |
-
return md
|
| 77 |
-
"""
|
| 78 |
-
|
| 79 |
-
def _call_extract_text_api(self, pdf_path: str, start_page: int = 1, end_page: int = 100, filename: str = None) -> str:
|
| 80 |
-
display_name = filename or os.path.basename(pdf_path)
|
| 81 |
-
logger.info(f'Starting text extraction for {display_name} (pages {start_page}-{end_page})')
|
| 82 |
-
with open(pdf_path, 'rb') as f:
|
| 83 |
-
files = {'file': (os.path.basename(pdf_path), f, 'application/pdf')}
|
| 84 |
-
data = {'start_page': start_page, 'end_page': end_page, 'filename': os.path.basename(pdf_path)}
|
| 85 |
-
headers = {'Authorization': f'Bearer {self.settings.hf_api_token}'}
|
| 86 |
-
response = requests.post(
|
| 87 |
-
'https://point9-extract-text-and-table.hf.space/api/text',
|
| 88 |
-
files=files,
|
| 89 |
-
data=data,
|
| 90 |
-
headers=headers
|
| 91 |
-
)
|
| 92 |
-
response.raise_for_status()
|
| 93 |
-
json_response = response.json()
|
| 94 |
-
if isinstance(json_response, dict):
|
| 95 |
-
result = json_response.get('result', '')
|
| 96 |
-
else:
|
| 97 |
-
logger.error(f"Unexpected response format: {json_response}")
|
| 98 |
-
result = ''
|
| 99 |
-
logger.info(f'Text extraction completed, response length: {len(result)}')
|
| 100 |
-
return result
|
| 101 |
-
|
| 102 |
-
def _call_extract_tables_api(self, pdf_path: str, start_page: int = 1, end_page: int = 2, filename: str = None) -> List[dict]:
|
| 103 |
-
# Commented out: Table extraction not needed for now
|
| 104 |
-
"""
|
| 105 |
-
display_name = filename or os.path.basename(pdf_path)
|
| 106 |
-
logger.info(f'Starting table extraction for {display_name} (pages {start_page}-{end_page})')
|
| 107 |
-
with open(pdf_path, 'rb') as f:
|
| 108 |
-
files = {'file': (os.path.basename(pdf_path), f, 'application/pdf')}
|
| 109 |
-
data = {'start_page': start_page, 'end_page': end_page, 'filename': os.path.basename(pdf_path)}
|
| 110 |
-
headers = {'Authorization': f'Bearer {self.settings.hf_api_token}'}
|
| 111 |
-
response = requests.post(
|
| 112 |
-
'https://point9-extract-text-and-table.hf.space/api/tables',
|
| 113 |
-
files=files,
|
| 114 |
-
data=data,
|
| 115 |
-
headers=headers
|
| 116 |
-
)
|
| 117 |
-
response.raise_for_status()
|
| 118 |
-
json_response = response.json()
|
| 119 |
-
if isinstance(json_response, dict):
|
| 120 |
-
result = json_response.get('result', [])
|
| 121 |
-
# Filter to only include valid table dicts
|
| 122 |
-
valid_tables = [t for t in result if isinstance(t, dict)]
|
| 123 |
-
invalid_count = len(result) - len(valid_tables)
|
| 124 |
-
if invalid_count > 0:
|
| 125 |
-
logger.warning(f"Filtered out {invalid_count} invalid tables (not dicts)")
|
| 126 |
-
result = valid_tables
|
| 127 |
-
else:
|
| 128 |
-
logger.error(f"Unexpected response format: {json_response}")
|
| 129 |
-
result = []
|
| 130 |
-
logger.info(f'Table extraction completed, found {len(result)} valid tables')
|
| 131 |
-
return result
|
| 132 |
-
"""
|
| 133 |
-
|
| 134 |
-
def load_and_process_pdf(self, pdf_path: str, filename: str = None) -> List[Document]:
|
| 135 |
-
try:
|
| 136 |
-
display_name = filename or os.path.basename(pdf_path)
|
| 137 |
-
logger.info(f'Processing PDF from {display_name} using Hugging Face API')
|
| 138 |
-
logger.info('Calling text extraction API...')
|
| 139 |
-
extracted_text = self._call_extract_text_api(pdf_path, filename=filename)
|
| 140 |
-
logger.info(f'Extracted text length: {len(extracted_text)}')
|
| 141 |
-
if extracted_text:
|
| 142 |
-
logger.info(f'Text preview: {extracted_text[:200]}...')
|
| 143 |
-
else:
|
| 144 |
-
logger.warning('Extracted text is empty')
|
| 145 |
-
# Commented out: Table extraction not needed for now
|
| 146 |
-
"""
|
| 147 |
-
logger.info('Calling table extraction API...')
|
| 148 |
-
tables = self._call_extract_tables_api(pdf_path, filename=filename)
|
| 149 |
-
logger.info(f'Extracted {len(tables)} tables')
|
| 150 |
-
logger.info('Converting tables to markdown...')
|
| 151 |
-
table_texts = [self._table_to_markdown(table) for table in tables]
|
| 152 |
-
logger.info(f'Converted {len(table_texts)} tables to markdown')
|
| 153 |
-
full_content = extracted_text + '\n\n' + '\n\n'.join(table_texts)
|
| 154 |
-
"""
|
| 155 |
-
full_content = extracted_text
|
| 156 |
-
logger.info(f'Combined content length: {len(full_content)}')
|
| 157 |
-
logger.info('Splitting content into chunks...')
|
| 158 |
-
chunks = self.text_splitter.create_documents([full_content])
|
| 159 |
-
logger.info(f'Split into {len(chunks)} chunks')
|
| 160 |
-
return chunks
|
| 161 |
-
except Exception as e:
|
| 162 |
-
logger.error(f'Error loading and processing PDF: {e}')
|
| 163 |
-
raise
|
| 164 |
-
|
| 165 |
-
def create_vector_store(self, chunks: List[Document]) -> FAISS:
|
| 166 |
-
try:
|
| 167 |
-
logger.info('Creating embeddings and vector store')
|
| 168 |
-
logger.info(f'Processing {len(chunks)} chunks for embeddings')
|
| 169 |
-
embeddings = HuggingFaceEmbeddings(model_name=self.settings.embedding_model)
|
| 170 |
-
logger.info('Embeddings model loaded, creating FAISS vector store...')
|
| 171 |
-
vector_store = FAISS.from_documents(chunks, embeddings)
|
| 172 |
-
logger.info('Vector store created successfully')
|
| 173 |
-
return vector_store
|
| 174 |
-
except Exception as e:
|
| 175 |
-
logger.error(f'Error creating vector store: {e}')
|
| 176 |
-
raise
|
| 177 |
-
|
| 178 |
-
    def setup_rag_chain(self, vector_store: FAISS):
        """Build a conversational retrieval (RAG) chain over *vector_store*.

        Uses the classic LangChain 0.1.x API with an in-memory conversation
        buffer so follow-up questions can reference earlier turns.

        Args:
            vector_store: FAISS store whose retriever supplies context.

        Returns:
            A ConversationalRetrievalChain ready to be invoked with questions.
        """
        logger.info('Setting up RAG chain with LangChain classic API (0.1.x)')
        # Local imports keep the classic-API dependency out of module import time.
        from langchain_classic.chains import ConversationalRetrievalChain
        from langchain_classic.memory import ConversationBufferMemory
        from langchain_core.prompts import PromptTemplate
        # return_messages=True stores history as message objects rather than a
        # flattened string, as expected by chat-style chains.
        memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
        # Prompt with {context} / {question} slots filled in by the chain.
        qa_template = '''You are an expert assistant specializing in construction and tender documents, with deep knowledge of Bill of Quantities (BOQ) analysis. Your role is to provide accurate, helpful, and professional responses based solely on the provided context.

Guidelines:
- Always base your answers on the given context. Do not use external knowledge or assumptions.
- For BOQ-related questions, provide detailed, structured information including item codes, descriptions, quantities, units, rates, and amounts where available.
- If the context lacks specific information, respond with: "The requested information is not available in the provided document context."
- Be concise yet comprehensive. Structure responses clearly (e.g., use bullet points or tables for lists).
- Handle follow-up questions by referencing previous context in the conversation history.
- Maintain neutrality and professionalism in all responses.

{context}

Question: {question}
Answer:'''
        qa_prompt = PromptTemplate.from_template(qa_template)
        qa_chain = ConversationalRetrievalChain.from_llm(
            llm=self.llm,
            retriever=vector_store.as_retriever(),
            memory=memory,
            combine_docs_chain_kwargs={'prompt': qa_prompt},
        )
        logger.info('RAG chain set up successfully with LangChain classic API')
        return qa_chain
|
| 207 |
-
|
| 208 |
-
def _extract_metadata(self, chunks: List[Document]) -> str:
|
| 209 |
-
metadata_text = '\n\n'.join([chunk.page_content for chunk in chunks[:3]])
|
| 210 |
-
metadata_prompt = f'''Extract key information from this tender document excerpt in a concise format:
|
| 211 |
-
- Document Type
|
| 212 |
-
- Project Name
|
| 213 |
-
- Issuing Authority
|
| 214 |
-
- Tender Number
|
| 215 |
-
- Date
|
| 216 |
-
- Location
|
| 217 |
-
|
| 218 |
-
Document excerpt:
|
| 219 |
-
{metadata_text[:2000]}
|
| 220 |
-
|
| 221 |
-
Output only the facts, no extra analysis.'''
|
| 222 |
-
logger.info('Invoking LLM for metadata extraction...')
|
| 223 |
-
result = str(self.llm.invoke(metadata_prompt))
|
| 224 |
-
logger.info('Metadata extraction completed')
|
| 225 |
-
return result
|
| 226 |
-
|
| 227 |
-
def _batch_chunks(self, chunks: List[Document], batch_size: int = 24) -> List[List[Document]]:
|
| 228 |
-
return [chunks[i:i + batch_size] for i in range(0, len(chunks), batch_size)]
|
| 229 |
-
|
| 230 |
-
    def _extract_boq_from_batch(self, batch_text: str, batch_chunks: List[Document], batch_num: int) -> List[str]:
        """Extract pipe-separated BOQ rows from one batch of text via the LLM.

        Returns strings of the form
        ``ITEM_CODE|DESCRIPTION|QUANTITY|UNIT|RATE|AMOUNT|CONFIDENCE|SOURCE``
        (the SOURCE field is appended locally, not produced by the LLM), or an
        empty list when no items are found or the LLM call fails.
        """
        # NOTE: formatted with str.format below, so {batch_text} is the only
        # placeholder; everything else is literal prompt text.
        extraction_prompt = '''Analyze this text and extract ONLY Bill of Quantities (BOQ) line items if present.

Look for structured data with:
- Item numbers or codes
- Descriptions of work/materials (extract the complete, full description as it appears in the document, without truncation)
- Quantities
- Units (Nos, Sqm, Cum, m, etc.)
- Rates/Unit prices
- Total amounts

If you find BOQ items, return them in this EXACT format (pipe-separated):
ITEM_CODE|DESCRIPTION|QUANTITY|UNIT|RATE|AMOUNT|CONFIDENCE

Where:
- CONFIDENCE is a score (0-100%) based on how clearly and completely the data appears in the text. Use lower scores (e.g., 70-90%) if information is partially missing, inferred, or unclear. Use 100% only for complete, directly stated data.

Rules for columns:
- If an entire column has no values, omit that column.
- For missing values, use "NA".

Return multiple items on separate lines. If NO BOQ items are found, return: "NO_BOQ_ITEMS"

Text to analyze:
{batch_text}

Extract only actual BOQ line items.'''

        # Cap the prompt payload to keep within the model's context window.
        prompt_text = batch_text[:30000]
        prompt = extraction_prompt.format(batch_text=prompt_text)

        try:
            logger.info(f'Invoking LLM for BOQ extraction on batch {batch_num}...')
            result = self.llm.invoke(prompt)
            logger.info(f'LLM response received for batch {batch_num}')
            # Sentinel the prompt asks the model to emit when nothing matched.
            if 'NO_BOQ_ITEMS' in str(result):
                logger.info(f'No BOQ items found in batch {batch_num}')
                return []
            boq_items = []
            lines = str(result).strip().split('\n')
            for line in lines:
                line = line.strip()
                if not line or '|' not in line:
                    continue
                parts = [p.strip() for p in line.split('|')]
                # Pad short rows so indices 0-6 always exist.
                if len(parts) < 7:
                    parts += ['NA'] * (7 - len(parts))
                # Temp source
                parts.append("Unknown")
                # Per-item page detection: find the description back in the
                # batch text, then take the last page marker preceding it.
                desc = parts[1]
                search_str = desc[:30].strip().lower()
                batch_text_lower = batch_text.lower()
                pos = batch_text_lower.rfind(search_str)
                if pos != -1:
                    marker_pattern = r"(?i)(?:---\s*)?page\s+(\d+)(?:\s*---)?"
                    matches = list(re.finditer(marker_pattern, batch_text[:pos]))
                    if matches:
                        page = matches[-1].group(1)
                        # NOTE(review): assumes the LLM emitted exactly 7 fields
                        # so index 7 is the appended "Unknown"; if the model
                        # returns 8+ fields this overwrites an LLM field -- confirm.
                        parts[7] = f"Page {page}"
                # Only the first 8 fields are kept regardless of LLM output.
                boq_items.append('|'.join(parts[:8]))
            logger.info(f'Extracted {len(boq_items)} BOQ items from batch {batch_num}')
            return boq_items
        except Exception as e:
            # Best-effort: a failed batch contributes no items rather than
            # aborting the whole extraction.
            logger.warning(f'Error processing batch {batch_num}: {e}')
            return []
|
| 296 |
-
|
| 297 |
-
    def _format_boq_output(self, unique_items: List[str], metadata_result: str) -> str:
        """Render extracted BOQ rows plus document metadata as a markdown report.

        Columns that are 'NA' in every row are dropped, except item code (0)
        and description (1), which are always kept.
        """
        logger.info('Formatting BOQ output...')
        if not unique_items:
            logger.info('No BOQ items to format')
            return f'''## DOCUMENT SUMMARY
{metadata_result}

## DETAILED BILL OF QUANTITIES
No BOQ items were found in this document.'''

        col_headers = ['Item No/Code', 'Description', 'Quantity', 'Unit', 'Rate', 'Amount', 'Confidence Score', 'Source']
        # cols_present[i] becomes True once any row carries a real value in column i.
        cols_present = [False] * 8
        normalized_items = []
        for item in unique_items:
            parts = [p.strip() for p in item.split('|')]
            # Pad to 8 fields so indexing below is always safe.
            if len(parts) < 8:
                parts += ['NA'] * (8 - len(parts))
            normalized_items.append(parts[:8])
            for i in range(8):
                if parts[i] and parts[i].upper() != 'NA':
                    cols_present[i] = True

        col_indices = [i for i, present in enumerate(cols_present) if present]
        # Item code and description are mandatory even if empty everywhere.
        if 0 not in col_indices:
            col_indices.insert(0, 0)
        if 1 not in col_indices:
            col_indices.insert(1, 1)

        header_row = '| ' + ' | '.join([col_headers[i] for i in col_indices]) + ' |\n'
        sep_row = '|' + '|'.join(['-' * (len(col_headers[i]) + 2) for i in col_indices]) + '|\n'

        formatted_boq = f'''## DOCUMENT SUMMARY
{metadata_result}

## DETAILED BILL OF QUANTITIES
**Total Items Found:** {len(unique_items)}

{header_row}{sep_row}'''

        for parts in normalized_items:
            # parts[1] remains full for complete descriptions
            parts[7] = parts[7][:50] if len(parts[7]) > 50 else parts[7]  # Truncate source if too long
            row_vals = [parts[i] for i in col_indices]
            # Add % to confidence score if present
            if 6 in col_indices:
                conf_idx = col_indices.index(6)
                if row_vals[conf_idx] != 'NA':
                    row_vals[conf_idx] = row_vals[conf_idx].rstrip('%') + '%'
                # Align confidence with source: if source unknown, set confidence to N/A
                if parts[7] == "Unknown":
                    row_vals[conf_idx] = "N/A"
            formatted_boq += '| ' + ' | '.join(row_vals) + ' |\n'

        # formatted_boq += f'\n## SUMMARY\n- **Total Items:** {len(unique_items)}\n'

        # Best-effort markdown repair: ensure a blank line before the table and
        # that the header row is immediately followed by a separator row.
        try:
            s = formatted_boq.replace('\r\n', '\n').replace('\r', '\n')
            lines = [ln.lstrip() for ln in s.split('\n')]
            header_idx = next((i for i, ln in enumerate(lines) if ln.startswith('| ')), None)
            if header_idx and header_idx > 0 and lines[header_idx - 1].strip():
                lines.insert(header_idx, '')
            if header_idx:
                sep_idx = header_idx + 1
                if not (sep_idx < len(lines) and re.match(r'^\|\s*-+', lines[sep_idx])):
                    cols = [c for c in lines[header_idx].split('|') if c.strip()]
                    sep = '|' + '|'.join(['---' for _ in cols]) + '|'
                    lines.insert(sep_idx, sep)
            formatted_boq = '\n'.join(lines).strip() + '\n\n'
        except Exception:
            # Cosmetic cleanup only; never let it break the report.
            pass

        return formatted_boq
|
| 369 |
-
|
| 370 |
-
@retry_with_exponential_backoff(max_retries=3, initial_delay=2)
|
| 371 |
-
def extract_boq_comprehensive(self, chunks: List[Document], vector_store: FAISS = None) -> str:
|
| 372 |
-
try:
|
| 373 |
-
logger.info(f'Starting comprehensive BOQ extraction from {len(chunks)} chunks')
|
| 374 |
-
logger.info('Extracting document metadata...')
|
| 375 |
-
metadata_result = self._extract_metadata(chunks)
|
| 376 |
-
logger.info('Metadata extracted, creating batches...')
|
| 377 |
-
batches = self._batch_chunks(chunks)
|
| 378 |
-
logger.info(f'Created {len(batches)} batches')
|
| 379 |
-
boq_items = []
|
| 380 |
-
for batch_num, batch_chunks in enumerate(batches, 1):
|
| 381 |
-
logger.info(f'Processing batch {batch_num}/{len(batches)} ({len(batch_chunks)} chunks)')
|
| 382 |
-
chunk_texts = [chunk.page_content for chunk in batch_chunks]
|
| 383 |
-
batch_text = '\n\n'.join(chunk_texts)
|
| 384 |
-
logger.info(f'Batch text length: {len(batch_text)}')
|
| 385 |
-
batch_items = self._extract_boq_from_batch(batch_text, batch_chunks, batch_num)
|
| 386 |
-
boq_items.extend(batch_items)
|
| 387 |
-
logger.info(f'Batch {batch_num} yielded {len(batch_items)} items')
|
| 388 |
-
unique_items = list(dict.fromkeys(boq_items))
|
| 389 |
-
logger.info(f'Found {len(unique_items)} unique BOQ items after deduplication')
|
| 390 |
-
logger.info('Formatting BOQ output...')
|
| 391 |
-
formatted_boq = self._format_boq_output(unique_items, metadata_result)
|
| 392 |
-
logger.info('Comprehensive BOQ extraction completed successfully')
|
| 393 |
-
return formatted_boq
|
| 394 |
-
except Exception as e:
|
| 395 |
-
logger.error(f'Error in comprehensive BOQ extraction: {e}')
|
| 396 |
-
raise
|
| 397 |
-
|
| 398 |
-
def check_consistency(chunks: List[Document], vector_store: FAISS, runs: int = 4) -> dict:
    """Run extraction multiple times and compute variance.

    Performs *runs* independent BOQ extractions and reports:
      - consistency_score: mean pairwise text similarity between runs (0-100)
      - avg_confidence: mean of the per-item 'Confidence' column across runs
    Failed runs are recorded as empty strings and excluded from similarity.
    """
    from difflib import SequenceMatcher

    results = []
    for _ in range(runs):
        try:
            results.append(extract_boq_comprehensive(chunks, vector_store))
        except Exception as e:
            logger.warning(f"Consistency run failed: {e}")
            results.append("")

    # Variance: average similarity over all pairs of successful runs.
    similarities = [
        SequenceMatcher(None, results[i], results[j]).ratio()
        for i in range(len(results))
        for j in range(i + 1, len(results))
        if results[i] and results[j]
    ]
    avg_similarity = sum(similarities) / len(similarities) if similarities else 0
    consistency_score = avg_similarity * 100

    # Average confidence from the per-item scores in each run's table.
    all_confidences = []
    for boq in results:
        if boq:
            all_confidences.extend(_parse_confidence_scores(boq))
    avg_confidence = sum(all_confidences) / len(all_confidences) if all_confidences else 0

    return {
        "consistency_score": round(consistency_score, 2),
        "runs": runs,
        "successful_runs": len([r for r in results if r]),
        "avg_similarity": round(avg_similarity, 2),
        "avg_confidence": round(avg_confidence, 2),
        "total_confidence_scores": len(all_confidences)
    }


def _parse_confidence_scores(boq: str) -> List[float]:
    """Pull numeric values from the 'Confidence' column of a markdown BOQ table.

    Fixes the original separator detection: it tested ``line.startswith('| ---')``
    but the generated separators look like ``|---...`` (no space), so separator
    rows were parsed as data and only discarded via a ValueError. Separators
    are now skipped explicitly.
    """
    lines = [ln.strip() for ln in boq.split('\n')]

    def is_separator(ln: str) -> bool:
        # Markdown separator rows start with '|' followed by dashes.
        return bool(re.match(r'^\|\s*-', ln))

    # Locate the 'Confidence' column index from the header row.
    confidence_idx = None
    for ln in lines:
        if '|' in ln and 'Confidence' in ln and not is_separator(ln):
            cells = [c.strip() for c in ln.split('|')[1:-1]]
            confidence_idx = next((i for i, c in enumerate(cells) if 'Confidence' in c), None)
            if confidence_idx is not None:
                break
    if confidence_idx is None:
        return []

    scores = []
    for ln in lines:
        # Data rows only: skip non-table lines, separators, and the header.
        if '|' not in ln or is_separator(ln) or 'Confidence' in ln:
            continue
        cells = [c.strip() for c in ln.split('|')[1:-1]]
        if len(cells) > confidence_idx:
            value = cells[confidence_idx].rstrip('%')
            if value and value != 'NA':
                try:
                    scores.append(float(value))
                except ValueError:
                    # Non-numeric cell (e.g. 'N/A'); ignore.
                    pass
    return scores
|
| 461 |
-
|
| 462 |
-
# Global instance for backward compatibility
processor = BOQProcessor()


# Backward compatibility functions
def load_and_process_pdf(pdf_path: str, filename: str = None) -> List[Document]:
    """Module-level shim delegating to the shared BOQProcessor instance."""
    return processor.load_and_process_pdf(pdf_path, filename)


def create_vector_store(chunks: List[Document]) -> FAISS:
    """Module-level shim delegating to the shared BOQProcessor instance."""
    return processor.create_vector_store(chunks)


def setup_rag_chain(vector_store: FAISS):
    """Module-level shim delegating to the shared BOQProcessor instance."""
    return processor.setup_rag_chain(vector_store)


# NOTE: no retry decorator here. BOQProcessor.extract_boq_comprehensive is
# already wrapped with @retry_with_exponential_backoff(max_retries=3,
# initial_delay=2); decorating this shim as well compounded retries (up to
# 3 x 3 = 9 attempts with nested exponential backoff) on a persistent failure.
def extract_boq_comprehensive(chunks: List[Document], vector_store: FAISS = None) -> str:
    """Module-level shim delegating to the shared BOQProcessor instance."""
    return processor.extract_boq_comprehensive(chunks, vector_store)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
config/settings.py
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Centralized configuration management for BOQTenders.
|
| 3 |
+
All configurable parameters are defined here with sensible defaults.
|
| 4 |
+
"""
|
| 5 |
+
import os
|
| 6 |
+
from typing import Optional
|
| 7 |
+
from pydantic_settings import BaseSettings
|
| 8 |
+
from pydantic import Field
|
| 9 |
+
import dotenv
|
| 10 |
+
|
| 11 |
+
# Load environment variables from .env file
|
| 12 |
+
dotenv.load_dotenv()
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class LLMSettings(BaseSettings):
    """LLM-related configuration."""

    # API Keys.
    # Optional so that importing this module does not crash when the env var
    # is unset: os.getenv returns None, which the original non-optional `str`
    # annotation rejected with a ValidationError at Settings() construction.
    # Code that actually calls the LLM should verify the key is present.
    google_api_key: Optional[str] = Field(
        default_factory=lambda: os.getenv("GOOGLE_API_KEY"),
        description="Google API key for Gemini",
    )

    # Model Configuration
    model_name: str = Field(default="gemini-2.5-flash-lite", description="LLM model name to use")
    temperature: float = Field(default=0.0, ge=0.0, le=2.0, description="LLM temperature (0.0 = deterministic)")
    max_output_tokens: int = Field(default=8192, description="Maximum tokens in LLM response")
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
class EmbeddingSettings(BaseSettings):
    """Embedding and vector store configuration."""

    # Sentence-transformer used to embed document chunks.
    embedding_model: str = Field(
        default="sentence-transformers/all-MiniLM-L6-v2",
        description="HuggingFace embedding model name",
    )

    # Text Splitting
    chunk_size: int = Field(
        default=1000,
        ge=100,
        le=10000,
        description="Size of text chunks for splitting",
    )
    chunk_overlap: int = Field(
        default=500,
        ge=0,
        description="Overlap between consecutive chunks",
    )
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
class PDFExtractionSettings(BaseSettings):
    """PDF extraction API configuration."""

    # Optional so an unset HF_API_TOKEN does not make Settings() raise a
    # ValidationError at import time (os.getenv returns None, which the
    # original non-optional `str` annotation rejected).
    hf_api_token: Optional[str] = Field(
        default_factory=lambda: os.getenv("HF_API_TOKEN"),
        description="HuggingFace API token",
    )
    extraction_api_url: str = Field(
        default="https://point9-extract-text-and-table.hf.space/api/text",
        description="URL for PDF text extraction API",
    )

    start_page: int = Field(default=1, ge=1, description="Default start page for extraction")
    end_page: int = Field(default=100, ge=1, description="Default end page for extraction")
    request_timeout: int = Field(default=120, description="API request timeout in seconds")
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
class BOQExtractionSettings(BaseSettings):
    """BOQ extraction specific configuration."""

    # How many chunks are merged into a single LLM extraction call.
    batch_size: int = Field(
        default=25,
        ge=1,
        le=100,
        description="Number of chunks per batch for BOQ extraction",
    )
    max_prompt_length: int = Field(
        default=30000,
        description="Maximum characters in extraction prompt",
    )
    page_search_length: int = Field(
        default=30,
        description="Characters to use for page detection search",
    )
    source_max_length: int = Field(
        default=50,
        description="Maximum length for source column",
    )
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
class ConsistencySettings(BaseSettings):
    """Consistency check configuration."""

    # Number of repeated extractions used to estimate run-to-run variance.
    default_runs: int = Field(
        default=4,
        ge=2,
        le=10,
        description="Default number of runs for consistency check",
    )
    low_consistency_threshold: float = Field(
        default=80.0,
        ge=0.0,
        le=100.0,
        description="Threshold below which consistency is considered low",
    )
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
class APISettings(BaseSettings):
    """FastAPI server configuration."""

    title: str = Field(default="BOQ Chatbot API", description="API title")
    description: str = Field(
        default="API for extracting and querying BOQ from tender PDFs using RAG",
        description="API description",
    )
    version: str = Field(default="1.0.0", description="API version")
    host: str = Field(default="0.0.0.0", description="Server host")
    port: int = Field(default=8000, description="Server port")
    debug: bool = Field(default=False, description="Enable debug mode")
    docs_enabled: bool = Field(default=True, description="Enable API documentation endpoints")
    # Typed element-wise: the original bare `list` annotation skipped element
    # validation, so non-string entries from the environment passed through.
    cors_origins: list[str] = Field(default=["*"], description="Allowed CORS origins")
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
class StreamlitSettings(BaseSettings):
    """Streamlit UI configuration."""

    page_title: str = Field(
        default="BOQ Agent",
        description="Page title",
    )
    page_icon: str = Field(
        default="π",
        description="Page icon",
    )
    layout: str = Field(
        default="wide",
        description="Page layout (wide/centered)",
    )
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
class Settings(BaseSettings):
    """
    Main settings class that aggregates all configuration sections.
    Access via: settings.llm, settings.embedding, settings.pdf, etc.
    """

    # Global settings
    log_level: str = Field(default="INFO", description="Logging level (DEBUG, INFO, WARNING, ERROR)")

    # One nested settings object per subsystem; each reads its own env vars.
    llm: LLMSettings = Field(default_factory=LLMSettings)
    embedding: EmbeddingSettings = Field(default_factory=EmbeddingSettings)
    pdf: PDFExtractionSettings = Field(default_factory=PDFExtractionSettings)
    boq: BOQExtractionSettings = Field(default_factory=BOQExtractionSettings)
    consistency: ConsistencySettings = Field(default_factory=ConsistencySettings)
    api: APISettings = Field(default_factory=APISettings)
    streamlit: StreamlitSettings = Field(default_factory=StreamlitSettings)

    class Config:
        # Values may also come from a local .env file.
        env_file = ".env"
        env_file_encoding = "utf-8"
        extra = "ignore"  # Ignore extra env vars not defined in model


# Global settings instance
settings = Settings()
|
core/embeddings.py
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Embedding and vector store management module.
|
| 3 |
+
"""
|
| 4 |
+
from typing import List, Optional
|
| 5 |
+
from loguru import logger
|
| 6 |
+
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
| 7 |
+
from langchain_huggingface import HuggingFaceEmbeddings
|
| 8 |
+
from langchain_community.vectorstores import FAISS
|
| 9 |
+
from langchain_core.documents import Document
|
| 10 |
+
|
| 11 |
+
from config.settings import settings
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class EmbeddingService:
    """
    Text chunking plus FAISS vector-store helpers.

    The HuggingFace embedding model is loaded lazily the first time it is
    needed and reused for subsequent operations.

    Example:
        service = EmbeddingService()
        chunks = service.split_text(text)
        vector_store = service.create_vector_store(chunks)
    """

    def __init__(self, embedding_model: str = None, chunk_size: int = None, chunk_overlap: int = None):
        """
        Set up the service, falling back to configured defaults.

        Args:
            embedding_model: HuggingFace model name. Defaults to config value.
            chunk_size: Size of text chunks. Defaults to config value.
            chunk_overlap: Overlap between chunks. Defaults to config value.
        """
        cfg = settings.embedding
        self.embedding_model = embedding_model or cfg.embedding_model
        self.chunk_size = chunk_size or cfg.chunk_size
        self.chunk_overlap = chunk_overlap or cfg.chunk_overlap

        self._text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
        )
        self._embeddings: Optional[HuggingFaceEmbeddings] = None

    @property
    def embeddings(self) -> HuggingFaceEmbeddings:
        """Embedding model, created on first access and cached."""
        if self._embeddings is None:
            logger.info(f'Loading embeddings model: {self.embedding_model}')
            self._embeddings = HuggingFaceEmbeddings(model_name=self.embedding_model)
            logger.info('Embeddings model loaded successfully')
        return self._embeddings

    def split_text(self, text: str) -> List[Document]:
        """Chunk *text* and return the pieces as Document objects."""
        logger.info(f'Splitting text of length {len(text)} into chunks...')
        chunks = self._text_splitter.create_documents([text])
        logger.info(f'Split into {len(chunks)} chunks')
        return chunks

    def create_vector_store(self, chunks: List[Document]) -> FAISS:
        """
        Index *chunks* into a new FAISS vector store.

        Args:
            chunks: Documents to embed and index.

        Returns:
            The populated FAISS store.

        Raises:
            Exception: Propagated when embedding or indexing fails.
        """
        try:
            logger.info(f'Creating vector store from {len(chunks)} chunks')
            store = FAISS.from_documents(chunks, self.embeddings)
            logger.info('Vector store created successfully')
            return store
        except Exception as e:
            logger.error(f'Error creating vector store: {e}')
            raise

    def add_documents(self, vector_store: FAISS, documents: List[Document]) -> None:
        """Append *documents* to an existing *vector_store* in place."""
        logger.info(f'Adding {len(documents)} documents to vector store')
        vector_store.add_documents(documents)
        logger.info('Documents added successfully')

    def similarity_search(
        self,
        vector_store: FAISS,
        query: str,
        k: int = 4
    ) -> List[Document]:
        """Return the *k* documents in *vector_store* most similar to *query*."""
        return vector_store.similarity_search(query, k=k)
|
core/llm.py
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
LLM client wrapper for Google Generative AI (Gemini).
|
| 3 |
+
"""
|
| 4 |
+
from typing import Optional
|
| 5 |
+
from loguru import logger
|
| 6 |
+
from langchain_google_genai import GoogleGenerativeAI
|
| 7 |
+
|
| 8 |
+
from config.settings import settings
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class LLMClient:
    """
    Thin wrapper around Google Generative AI (Gemini) for text prompts.

    The underlying model object is constructed lazily on first use.

    Example:
        client = LLMClient()
        response = client.invoke("What is a BOQ?")
    """

    def __init__(self, model_name: str = None, temperature: float = None, api_key: str = None):
        """
        Store configuration, falling back to global settings where omitted.

        Args:
            model_name: Model to use. Defaults to config value.
            temperature: Sampling temperature. Defaults to config value.
            api_key: Google API key. Defaults to config value.
        """
        cfg = settings.llm
        self.model_name = model_name or cfg.model_name
        # temperature may legitimately be 0.0, so test for None explicitly.
        self.temperature = cfg.temperature if temperature is None else temperature
        self.api_key = api_key or cfg.google_api_key

        self._llm: Optional[GoogleGenerativeAI] = None

    @property
    def llm(self) -> GoogleGenerativeAI:
        """Underlying model client, built on first access and cached."""
        if self._llm is None:
            logger.info(f'Initializing LLM: {self.model_name} (temp={self.temperature})')
            self._llm = GoogleGenerativeAI(
                model=self.model_name,
                temperature=self.temperature,
                google_api_key=self.api_key,
            )
            logger.info('LLM initialized successfully')
        return self._llm

    def invoke(self, prompt: str) -> str:
        """
        Run one prompt through the model.

        Args:
            prompt: The prompt text.

        Returns:
            The model's response as a string.
        """
        logger.debug(f'Invoking LLM with prompt of length {len(prompt)}')
        result = str(self.llm.invoke(prompt))
        logger.debug(f'LLM response received, length: {len(result)}')
        return result

    def batch_invoke(self, prompts: list[str]) -> list[str]:
        """
        Run each prompt sequentially and collect the responses in order.

        Args:
            prompts: List of prompt texts.

        Returns:
            One response string per prompt.
        """
        logger.info(f'Batch invoking LLM with {len(prompts)} prompts')
        responses = []
        for i, prompt in enumerate(prompts, 1):
            logger.debug(f'Processing prompt {i}/{len(prompts)}')
            responses.append(self.invoke(prompt))
        return responses
|
core/pdf_extractor.py
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
PDF text extraction module using external API.
|
| 3 |
+
"""
|
| 4 |
+
import os
|
| 5 |
+
from typing import Optional
|
| 6 |
+
import requests
|
| 7 |
+
from loguru import logger
|
| 8 |
+
|
| 9 |
+
from config.settings import settings
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class PDFExtractor:
    """
    Handles PDF text extraction using HuggingFace Space API.

    Example:
        extractor = PDFExtractor()
        text = extractor.extract_text("document.pdf")
    """

    def __init__(self, api_url: Optional[str] = None, api_token: Optional[str] = None, timeout: Optional[int] = None):
        """
        Initialize PDF extractor.

        Args:
            api_url: URL for extraction API. Defaults to config value.
            api_token: HuggingFace API token. Defaults to config value.
            timeout: Request timeout in seconds. Defaults to config value.
        """
        # Compare against None (not truthiness) so explicit falsy values
        # supplied by a caller are respected rather than silently replaced.
        self.api_url = api_url if api_url is not None else settings.pdf.extraction_api_url
        self.api_token = api_token if api_token is not None else settings.pdf.hf_api_token
        self.timeout = timeout if timeout is not None else settings.pdf.request_timeout

    def extract_text(self, pdf_path: str, start_page: Optional[int] = None, end_page: Optional[int] = None, filename: Optional[str] = None) -> str:
        """
        Extract text from a PDF file via the remote extraction API.

        Args:
            pdf_path: Path to the PDF file.
            start_page: Starting page number (1-indexed). Defaults to config value.
            end_page: Ending page number. Defaults to config value.
            filename: Display name for logging. Defaults to basename of pdf_path.

        Returns:
            Extracted text content ('' if the API response is malformed).

        Raises:
            requests.RequestException: If API request fails (incl. non-2xx status).
            FileNotFoundError: If PDF file doesn't exist.
        """
        # Use config defaults only when the caller did not pass a value.
        if start_page is None:
            start_page = settings.pdf.start_page
        if end_page is None:
            end_page = settings.pdf.end_page
        display_name = filename or os.path.basename(pdf_path)

        logger.info(f'Starting text extraction for {display_name} (pages {start_page}-{end_page})')

        if not os.path.exists(pdf_path):
            raise FileNotFoundError(f"PDF file not found: {pdf_path}")

        # Stream the file inside a context manager so the handle is always closed.
        with open(pdf_path, 'rb') as f:
            files = {'file': (os.path.basename(pdf_path), f, 'application/pdf')}
            data = {
                'start_page': start_page,
                'end_page': end_page,
                'filename': os.path.basename(pdf_path)
            }
            headers = {'Authorization': f'Bearer {self.api_token}'}

            response = requests.post(
                self.api_url,
                files=files,
                data=data,
                headers=headers,
                timeout=self.timeout
            )
            response.raise_for_status()

            json_response = response.json()
            if isinstance(json_response, dict):
                result = json_response.get('result', '')
            else:
                # Best-effort: log the surprise and return empty rather than crash.
                logger.error(f"Unexpected response format: {json_response}")
                result = ''

        logger.info(f'Text extraction completed, response length: {len(result)}')
        return result

    def extract_text_preview(self, pdf_path: str, max_chars: int = 200) -> str:
        """
        Extract and return a preview of the PDF text (first 5 pages).

        Args:
            pdf_path: Path to the PDF file.
            max_chars: Maximum characters to return.

        Returns:
            Preview of extracted text, suffixed with "..." when truncated.
        """
        text = self.extract_text(pdf_path, start_page=1, end_page=5)
        return text[:max_chars] + "..." if len(text) > max_chars else text
|
core/rag_chain.py
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
RAG chain builder for conversational retrieval.
|
| 3 |
+
"""
|
| 4 |
+
from loguru import logger
|
| 5 |
+
from langchain_community.vectorstores import FAISS
|
| 6 |
+
from langchain_classic.chains import ConversationalRetrievalChain
|
| 7 |
+
from langchain_classic.memory import ConversationBufferMemory
|
| 8 |
+
from langchain_core.prompts import PromptTemplate
|
| 9 |
+
|
| 10 |
+
from core.llm import LLMClient
|
| 11 |
+
from prompts.get_prompts import QA_TEMPLATE
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class RAGChainBuilder:
    """
    Builds RAG (Retrieval-Augmented Generation) chains over a FAISS index.

    Example:
        builder = RAGChainBuilder()
        chain = builder.build(vector_store)
        response = chain({"question": "What is the total quantity?"})
    """

    def __init__(self, llm_client: LLMClient = None):
        """
        Initialize RAG chain builder.

        Args:
            llm_client: LLM client instance. Creates new one if not provided.
        """
        self.llm_client = llm_client if llm_client else LLMClient()

    def build(self, vector_store: FAISS, qa_template: str = None, memory_key: str = "chat_history", return_messages: bool = True) -> ConversationalRetrievalChain:
        """
        Build a conversational retrieval chain over the given vector store.

        Args:
            vector_store: FAISS vector store with document embeddings.
            qa_template: Custom Q&A prompt template. Defaults to standard template.
            memory_key: Key for conversation memory.
            return_messages: Whether to return messages in memory.

        Returns:
            Configured ConversationalRetrievalChain.
        """
        logger.info('Building RAG chain with LangChain classic API')

        # Conversation memory carried across turns of the chain.
        chat_memory = ConversationBufferMemory(
            memory_key=memory_key,
            return_messages=return_messages
        )

        # Prompt for the combine-docs step; falls back to the project default.
        prompt = PromptTemplate.from_template(qa_template if qa_template else QA_TEMPLATE)

        retrieval_chain = ConversationalRetrievalChain.from_llm(
            llm=self.llm_client.llm,
            retriever=vector_store.as_retriever(),
            memory=chat_memory,
            combine_docs_chain_kwargs={'prompt': prompt},
        )

        logger.info('RAG chain built successfully')
        return retrieval_chain

    def build_simple_retriever(self, vector_store: FAISS, k: int = 4):
        """
        Build a plain retriever with no conversation memory.

        Args:
            vector_store: FAISS vector store.
            k: Number of documents to retrieve.

        Returns:
            Retriever instance.
        """
        return vector_store.as_retriever(search_kwargs={"k": k})
|
prompts/get_prompts.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Load prompt templates from YAML file for BOQ extraction.
|
| 3 |
+
"""
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
import yaml
|
| 6 |
+
|
| 7 |
+
# Load templates from YAML
|
| 8 |
+
_TEMPLATES_PATH = Path(__file__).parent / "templates.yaml"
|
| 9 |
+
|
| 10 |
+
with open(_TEMPLATES_PATH, 'r', encoding='utf-8') as f:
|
| 11 |
+
_templates = yaml.safe_load(f)
|
| 12 |
+
|
| 13 |
+
# Export as module constants
|
| 14 |
+
QA_TEMPLATE = _templates['qa_template']
|
| 15 |
+
METADATA_EXTRACTION_TEMPLATE = _templates['metadata_extraction_template']
|
| 16 |
+
BOQ_EXTRACTION_TEMPLATE = _templates['boq_extraction_template']
|
| 17 |
+
BOQ_COLUMN_HEADERS = _templates['boq_column_headers']
|
prompts/templates.yaml
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# BOQTenders Prompt Templates
|
| 2 |
+
# YAML format for better production management and versioning
|
| 3 |
+
|
| 4 |
+
qa_template: |
|
| 5 |
+
You are an expert assistant specializing in construction and tender documents, with deep knowledge of Bill of Quantities (BOQ) analysis. Your role is to provide accurate, helpful, and professional responses based solely on the provided context.
|
| 6 |
+
|
| 7 |
+
Guidelines:
|
| 8 |
+
- Always base your answers on the given context. Do not use external knowledge or assumptions.
|
| 9 |
+
- For BOQ-related questions, provide detailed, structured information including item codes, descriptions, quantities, units, rates, and amounts where available.
|
| 10 |
+
- If the context lacks specific information, respond with: "The requested information is not available in the provided document context."
|
| 11 |
+
- Be concise yet comprehensive. Structure responses clearly (e.g., use bullet points or tables for lists).
|
| 12 |
+
- Handle follow-up questions by referencing previous context in the conversation history.
|
| 13 |
+
- Maintain neutrality and professionalism in all responses.
|
| 14 |
+
|
| 15 |
+
{context}
|
| 16 |
+
|
| 17 |
+
Question: {question}
|
| 18 |
+
Answer:
|
| 19 |
+
|
| 20 |
+
metadata_extraction_template: |
|
| 21 |
+
Extract key information from this tender document excerpt in a concise format:
|
| 22 |
+
- Document Type
|
| 23 |
+
- Project Name
|
| 24 |
+
- Issuing Authority
|
| 25 |
+
- Tender Number
|
| 26 |
+
- Date
|
| 27 |
+
- Location
|
| 28 |
+
|
| 29 |
+
Document excerpt:
|
| 30 |
+
{document_text}
|
| 31 |
+
|
| 32 |
+
Output only the facts, no extra analysis.
|
| 33 |
+
|
| 34 |
+
boq_extraction_template: |
|
| 35 |
+
Analyze this text and extract ONLY Bill of Quantities (BOQ) line items if present.
|
| 36 |
+
|
| 37 |
+
Look for structured data with:
|
| 38 |
+
- Item numbers or codes
|
| 39 |
+
- Descriptions of work/materials (extract the complete, full description as it appears in the document, without truncation)
|
| 40 |
+
- Quantities
|
| 41 |
+
- Units (Nos, Sqm, Cum, m, etc.)
|
| 42 |
+
- Rates/Unit prices
|
| 43 |
+
- Total amounts
|
| 44 |
+
|
| 45 |
+
If you find BOQ items, return them in this EXACT format (pipe-separated):
|
| 46 |
+
ITEM_CODE|DESCRIPTION|QUANTITY|UNIT|RATE|AMOUNT|CONFIDENCE
|
| 47 |
+
|
| 48 |
+
Where:
|
| 49 |
+
- CONFIDENCE is a score (0-100%) based on how clearly and completely the data appears in the text. Use lower scores (e.g., 70-90%) if information is partially missing, inferred, or unclear. Use 100% only for complete, directly stated data.
|
| 50 |
+
|
| 51 |
+
Rules for columns:
|
| 52 |
+
- If an entire column has no values, omit that column.
|
| 53 |
+
- For missing values, use "NA".
|
| 54 |
+
|
| 55 |
+
Return multiple items on separate lines. If NO BOQ items are found, return: "NO_BOQ_ITEMS"
|
| 56 |
+
|
| 57 |
+
Text to analyze:
|
| 58 |
+
{batch_text}
|
| 59 |
+
|
| 60 |
+
Extract only actual BOQ line items.
|
| 61 |
+
|
| 62 |
+
# Column headers for BOQ output table
|
| 63 |
+
boq_column_headers:
|
| 64 |
+
- "Item No/Code"
|
| 65 |
+
- "Description"
|
| 66 |
+
- "Quantity"
|
| 67 |
+
- "Unit"
|
| 68 |
+
- "Rate"
|
| 69 |
+
- "Amount"
|
| 70 |
+
- "Confidence Score"
|
| 71 |
+
- "Source"
|
requirements.txt
CHANGED
|
@@ -1,15 +1,51 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# BOQTenders - Requirements
|
| 2 |
+
# Python 3.9+
|
| 3 |
+
|
| 4 |
+
# ============================================
|
| 5 |
+
# Core Dependencies
|
| 6 |
+
# ============================================
|
| 7 |
+
python-dotenv>=1.0.0
|
| 8 |
+
pydantic>=2.0.0
|
| 9 |
+
pydantic-settings>=2.0.0
|
| 10 |
+
loguru>=0.7.0
|
| 11 |
+
PyYAML>=6.0.0
|
| 12 |
+
|
| 13 |
+
# ============================================
|
| 14 |
+
# LLM & AI
|
| 15 |
+
# ============================================
|
| 16 |
+
# LangChain 0.1.x (Classic API - stable)
|
| 17 |
+
langchain>=0.1.0,<0.2.0
|
| 18 |
+
langchain-community>=0.0.1,<0.1.0
|
| 19 |
+
langchain-google-genai>=0.0.5
|
| 20 |
+
|
| 21 |
+
# Vector Store
|
| 22 |
+
faiss-cpu>=1.7.4
|
| 23 |
+
|
| 24 |
+
# Embeddings
|
| 25 |
+
sentence-transformers>=2.2.0
|
| 26 |
+
|
| 27 |
+
# ============================================
|
| 28 |
+
# PDF Processing
|
| 29 |
+
# ============================================
|
| 30 |
+
PyPDF2>=3.0.0
|
| 31 |
+
requests>=2.31.0
|
| 32 |
+
|
| 33 |
+
# ============================================
|
| 34 |
+
# Web Framework
|
| 35 |
+
# ============================================
|
| 36 |
+
# API
|
| 37 |
+
fastapi>=0.100.0
|
| 38 |
+
uvicorn[standard]>=0.23.0
|
| 39 |
+
python-multipart>=0.0.6
|
| 40 |
+
|
| 41 |
+
# UI
|
| 42 |
+
streamlit>=1.28.0
|
| 43 |
+
|
| 44 |
+
# ============================================
|
| 45 |
+
# Development (optional)
|
| 46 |
+
# ============================================
|
| 47 |
+
# pytest>=7.4.0
|
| 48 |
+
# pytest-asyncio>=0.21.0
|
| 49 |
+
# black>=23.0.0
|
| 50 |
+
# isort>=5.12.0
|
| 51 |
+
# mypy>=1.5.0
|
services/boq_extractor.py
ADDED
|
@@ -0,0 +1,296 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
BOQ extraction service for extracting Bill of Quantities from documents.
|
| 3 |
+
"""
|
| 4 |
+
import re
|
| 5 |
+
from typing import List, Optional, Tuple
|
| 6 |
+
from loguru import logger
|
| 7 |
+
from langchain_core.documents import Document
|
| 8 |
+
from langchain_community.vectorstores import FAISS
|
| 9 |
+
|
| 10 |
+
from config.settings import settings
|
| 11 |
+
from core.llm import LLMClient
|
| 12 |
+
from prompts.get_prompts import (
|
| 13 |
+
METADATA_EXTRACTION_TEMPLATE,
|
| 14 |
+
BOQ_EXTRACTION_TEMPLATE,
|
| 15 |
+
BOQ_COLUMN_HEADERS,
|
| 16 |
+
)
|
| 17 |
+
|
| 18 |
+
# Page marker pattern for detecting source pages (e.g. "--- Page 12 ---" or "page 12")
PAGE_MARKER_PATTERN = r"(?i)(?:---\s*)?page\s+(\d+)(?:\s*---)?"


class BOQExtractor:
    """
    Extracts Bill of Quantities (BOQ) data from document chunks.

    Chunks are grouped into batches, each batch is sent to the LLM with a
    structured-extraction prompt, and the pipe-separated rows it returns are
    parsed, deduplicated and rendered as a markdown table together with
    document metadata.

    Example:
        extractor = BOQExtractor()
        boq_output = extractor.extract(chunks)
    """

    def __init__(self, llm_client: Optional[LLMClient] = None, batch_size: Optional[int] = None, max_prompt_length: Optional[int] = None, page_search_length: Optional[int] = None):
        """
        Initialize BOQ extractor.

        Args:
            llm_client: LLM client instance. Creates new one if not provided.
            batch_size: Number of chunks per batch. Defaults to config value.
            max_prompt_length: Max chars in prompt. Defaults to config value.
            page_search_length: Chars for page detection. Defaults to config value.
        """
        self.llm_client = llm_client or LLMClient()
        self.batch_size = batch_size or settings.boq.batch_size
        self.max_prompt_length = max_prompt_length or settings.boq.max_prompt_length
        self.page_search_length = page_search_length or settings.boq.page_search_length
        self.source_max_length = settings.boq.source_max_length

    def _batch_chunks(self, chunks: List[Document]) -> List[List[Document]]:
        """Split chunks into consecutive batches of at most self.batch_size."""
        return [
            chunks[i:i + self.batch_size]
            for i in range(0, len(chunks), self.batch_size)
        ]

    def _extract_metadata(self, chunks: List[Document]) -> str:
        """Extract document metadata (project, authority, dates, ...) from the first few chunks."""
        metadata_text = '\n\n'.join([chunk.page_content for chunk in chunks[:3]])
        # Cap the excerpt; key facts appear at the start of tender documents.
        prompt = METADATA_EXTRACTION_TEMPLATE.format(
            document_text=metadata_text[:2000]
        )

        logger.info('Invoking LLM for metadata extraction...')
        result = self.llm_client.invoke(prompt)
        logger.info('Metadata extraction completed')
        return result

    def _detect_page_source(self, desc: str, batch_text: str) -> str:
        """
        Detect the page number for a BOQ item based on its description.

        Locates the last occurrence of the description prefix in the batch
        text and reports the closest preceding "Page N" marker.

        Args:
            desc: Item description.
            batch_text: Full batch text to search in.

        Returns:
            Page source string (e.g., "Page 5") or "Unknown".
        """
        search_str = desc[:self.page_search_length].strip().lower()
        if not search_str:
            # rfind('') matches at end-of-text, which would wrongly attribute
            # an empty description to the document's last page marker.
            return "Unknown"

        pos = batch_text.lower().rfind(search_str)
        if pos != -1:
            matches = list(re.finditer(PAGE_MARKER_PATTERN, batch_text[:pos]))
            if matches:
                return f"Page {matches[-1].group(1)}"

        return "Unknown"

    def _parse_boq_line(self, line: str, batch_text: str) -> Optional[str]:
        """
        Parse a single BOQ line from LLM output.

        Args:
            line: Raw line from LLM output.
            batch_text: Full batch text for page detection.

        Returns:
            Pipe-joined 8-field string (7 data fields + source page),
            or None if the line is not a BOQ row.
        """
        line = line.strip()
        if not line or '|' not in line:
            return None

        # Normalize to exactly 7 data fields: pad missing ones with "NA" and
        # drop any spurious extra fields emitted by the LLM.
        parts = [p.strip() for p in line.split('|')][:7]
        if len(parts) < 7:
            parts += ['NA'] * (7 - len(parts))

        # Append the detected source page as the 8th field.
        parts.append(self._detect_page_source(parts[1], batch_text))

        return '|'.join(parts)

    def _extract_from_batch(self, batch_text: str, batch_num: int) -> List[str]:
        """
        Extract BOQ items from a single batch.

        Args:
            batch_text: Combined text from batch chunks.
            batch_num: Batch number for logging.

        Returns:
            List of BOQ item strings (empty on failure or no items).
        """
        prompt_text = batch_text[:self.max_prompt_length]
        prompt = BOQ_EXTRACTION_TEMPLATE.format(batch_text=prompt_text)

        try:
            logger.info(f'Invoking LLM for BOQ extraction on batch {batch_num}...')
            result = self.llm_client.invoke(prompt)
            logger.info(f'LLM response received for batch {batch_num}')

            # The prompt instructs the model to emit this sentinel explicitly.
            if 'NO_BOQ_ITEMS' in result:
                logger.info(f'No BOQ items found in batch {batch_num}')
                return []

            boq_items = []
            for line in result.strip().split('\n'):
                parsed = self._parse_boq_line(line, batch_text)
                if parsed:
                    boq_items.append(parsed)

            logger.info(f'Extracted {len(boq_items)} BOQ items from batch {batch_num}')
            return boq_items

        except Exception as e:
            # Deliberate best-effort: a failed batch must not abort the run.
            logger.warning(f'Error processing batch {batch_num}: {e}')
            return []

    def _format_output(self, unique_items: List[str], metadata_result: str) -> str:
        """
        Format extracted BOQ items into markdown output.

        Args:
            unique_items: List of unique BOQ item strings.
            metadata_result: Document metadata.

        Returns:
            Formatted markdown string.
        """
        logger.info('Formatting BOQ output...')

        if not unique_items:
            logger.info('No BOQ items to format')
            return f'''## DOCUMENT SUMMARY
{metadata_result}

## DETAILED BILL OF QUANTITIES
No BOQ items were found in this document.'''

        # Determine which of the 8 columns carry any real (non-"NA") data.
        cols_present = [False] * 8
        normalized_items = []

        for item in unique_items:
            parts = [p.strip() for p in item.split('|')]
            if len(parts) < 8:
                parts += ['NA'] * (8 - len(parts))
            normalized_items.append(parts[:8])

            for i in range(8):
                if parts[i] and parts[i].upper() != 'NA':
                    cols_present[i] = True

        # Build column indices (always include item code and description).
        col_indices = [i for i, present in enumerate(cols_present) if present]
        if 0 not in col_indices:
            col_indices.insert(0, 0)
        if 1 not in col_indices:
            col_indices.insert(1, 1)

        # Header and separator rows for the markdown table.
        header_row = '| ' + ' | '.join([BOQ_COLUMN_HEADERS[i] for i in col_indices]) + ' |\n'
        sep_row = '|' + '|'.join(['-' * (len(BOQ_COLUMN_HEADERS[i]) + 2) for i in col_indices]) + '|\n'

        formatted_boq = f'''## DOCUMENT SUMMARY
{metadata_result}

## DETAILED BILL OF QUANTITIES
**Total Items Found:** {len(unique_items)}

{header_row}{sep_row}'''

        # Add data rows.
        for parts in normalized_items:
            # Truncate overly long source labels.
            parts[7] = parts[7][:self.source_max_length] if len(parts[7]) > self.source_max_length else parts[7]
            row_vals = [parts[i] for i in col_indices]

            # Normalize the confidence score ("95" -> "95%"); items without a
            # resolvable source page get "N/A" confidence.
            if 6 in col_indices:
                conf_idx = col_indices.index(6)
                if row_vals[conf_idx] != 'NA':
                    row_vals[conf_idx] = row_vals[conf_idx].rstrip('%') + '%'
                if parts[7] == "Unknown":
                    row_vals[conf_idx] = "N/A"

            formatted_boq += '| ' + ' | '.join(row_vals) + ' |\n'

        # Best-effort cosmetic cleanup: normalize newlines and make sure the
        # markdown table has a blank line before the header and a separator
        # row right after it.
        try:
            s = formatted_boq.replace('\r\n', '\n').replace('\r', '\n')
            lines = [ln.lstrip() for ln in s.split('\n')]
            header_idx = next((i for i, ln in enumerate(lines) if ln.startswith('| ')), None)

            # Compare against None explicitly: a header at index 0 is valid.
            if header_idx is not None and header_idx > 0 and lines[header_idx - 1].strip():
                lines.insert(header_idx, '')
                header_idx += 1  # the header shifted down by the inserted blank line

            if header_idx is not None:
                sep_idx = header_idx + 1
                if not (sep_idx < len(lines) and re.match(r'^\|\s*-+', lines[sep_idx])):
                    cols = [c for c in lines[header_idx].split('|') if c.strip()]
                    sep = '|' + '|'.join(['---' for _ in cols]) + '|'
                    lines.insert(sep_idx, sep)

            formatted_boq = '\n'.join(lines).strip() + '\n\n'
        except Exception:
            # Formatting cleanup must never break extraction.
            pass

        return formatted_boq

    def extract(self, chunks: List[Document], vector_store: Optional[FAISS] = None) -> str:
        """
        Extract BOQ from document chunks.

        Args:
            chunks: List of Document chunks.
            vector_store: Optional vector store (not used currently).

        Returns:
            Formatted BOQ output as markdown string.

        Raises:
            Exception: Re-raises any failure in metadata extraction or formatting.
        """
        try:
            logger.info(f'Starting comprehensive BOQ extraction from {len(chunks)} chunks')

            # Extract metadata.
            logger.info('Extracting document metadata...')
            metadata_result = self._extract_metadata(chunks)

            # Create batches.
            logger.info('Creating batches...')
            batches = self._batch_chunks(chunks)
            logger.info(f'Created {len(batches)} batches')

            # Extract from each batch.
            boq_items = []
            for batch_num, batch_chunks in enumerate(batches, 1):
                logger.info(f'Processing batch {batch_num}/{len(batches)} ({len(batch_chunks)} chunks)')

                batch_text = '\n\n'.join(chunk.page_content for chunk in batch_chunks)
                logger.info(f'Batch text length: {len(batch_text)}')

                batch_items = self._extract_from_batch(batch_text, batch_num)
                boq_items.extend(batch_items)
                logger.info(f'Batch {batch_num} yielded {len(batch_items)} items')

            # Deduplicate while preserving first-seen order.
            unique_items = list(dict.fromkeys(boq_items))
            logger.info(f'Found {len(unique_items)} unique BOQ items after deduplication')

            # Format output.
            logger.info('Formatting BOQ output...')
            formatted_boq = self._format_output(unique_items, metadata_result)

            logger.info('Comprehensive BOQ extraction completed successfully')
            return formatted_boq

        except Exception as e:
            logger.error(f'Error in comprehensive BOQ extraction: {e}')
            raise
|
services/consistency.py
ADDED
|
@@ -0,0 +1,161 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Consistency checking service for evaluating BOQ extraction reliability.
|
| 3 |
+
"""
|
| 4 |
+
from typing import List, Dict, Any
|
| 5 |
+
from difflib import SequenceMatcher
|
| 6 |
+
from loguru import logger
|
| 7 |
+
from langchain_core.documents import Document
|
| 8 |
+
from langchain_community.vectorstores import FAISS
|
| 9 |
+
|
| 10 |
+
from config.settings import settings
|
| 11 |
+
from services.boq_extractor import BOQExtractor
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class ConsistencyChecker:
    """
    Checks consistency of BOQ extractions across multiple runs.

    Runs the extractor several times on the same input, measures how similar
    the outputs are (average pairwise SequenceMatcher ratio), and averages any
    per-row confidence scores embedded in the markdown BOQ tables.

    Example:
        checker = ConsistencyChecker()
        result = checker.check(chunks, vector_store, runs=4)
        print(f"Consistency: {result['consistency_score']}%")
    """

    def __init__(self, boq_extractor: BOQExtractor = None, default_runs: int = None, low_threshold: float = None):
        """
        Initialize consistency checker.

        Args:
            boq_extractor: BOQ extractor instance. Creates a new one if not provided.
            default_runs: Default number of extraction runs. Defaults to config value.
            low_threshold: Consistency percentage below which results are flagged
                as low. Defaults to config value.
        """
        self.boq_extractor = boq_extractor or BOQExtractor()
        self.default_runs = default_runs or settings.consistency.default_runs
        self.low_threshold = low_threshold or settings.consistency.low_consistency_threshold

    def _calculate_similarity(self, results: List[str]) -> float:
        """
        Calculate average pairwise similarity between results.

        Empty results (failed runs) are excluded from the comparison.

        Args:
            results: List of BOQ extraction results.

        Returns:
            Average similarity score (0.0 to 1.0); 0 when fewer than two
            non-empty results exist.
        """
        similarities = [
            SequenceMatcher(None, results[i], results[j]).ratio()
            for i in range(len(results))
            for j in range(i + 1, len(results))
            if results[i] and results[j]
        ]
        return sum(similarities) / len(similarities) if similarities else 0

    @staticmethod
    def _is_separator_row(cells: List[str]) -> bool:
        """Return True for markdown table separator rows like `| --- |` or `|:---:|`."""
        return bool(cells) and all(
            cell and set(cell) <= {'-', ':', ' '} for cell in cells
        )

    def _extract_confidence_scores(self, boq: str) -> List[float]:
        """
        Extract confidence scores from a markdown-table BOQ output.

        Locates a header row containing a 'Confidence' column, then parses that
        column from every data row. Values may carry a trailing '%'; empty,
        'NA'/'N/A' and non-numeric cells are skipped.

        Args:
            boq: Formatted BOQ output string.

        Returns:
            List of confidence score values (empty if no Confidence column).
        """
        if not boq:
            return []

        # Normalize the table once: strip each line, split into cells, and
        # explicitly drop separator rows (previously an unstripped-line
        # `startswith('| ---')` check missed variants like `|---|` and the
        # float() ValueError was used as control flow to skip them).
        table_rows = []  # (cells, mentions_confidence) per table row
        for raw_line in boq.split('\n'):
            line = raw_line.strip()
            if '|' not in line:
                continue
            cells = [c.strip() for c in line.split('|')[1:-1]]
            if self._is_separator_row(cells):
                continue
            table_rows.append((cells, 'Confidence' in line))

        # Find the Confidence column index from the first header-like row.
        confidence_idx = None
        for cells, is_header in table_rows:
            if is_header:
                confidence_idx = next(
                    (i for i, c in enumerate(cells) if 'Confidence' in c),
                    None
                )
                if confidence_idx is not None:
                    break

        if confidence_idx is None:
            return []

        # Collect numeric confidence values from data rows.
        confidences = []
        for cells, is_header in table_rows:
            if is_header or len(cells) <= confidence_idx:
                continue
            value = cells[confidence_idx]
            if value and value not in ('NA', 'N/A'):
                try:
                    confidences.append(float(value.rstrip('%')))
                except ValueError:
                    pass  # non-numeric cell (e.g. 'High') — ignore

        return confidences

    def check(self, chunks: List[Document], vector_store: FAISS, runs: int = None) -> Dict[str, Any]:
        """
        Run multiple BOQ extractions and compute consistency metrics.

        Args:
            chunks: Document chunks to extract from.
            vector_store: Vector store (passed to extractor).
            runs: Number of extraction runs. Defaults to config value.

        Returns:
            Dictionary with consistency metrics:
                - consistency_score: Overall consistency percentage
                - runs: Number of runs attempted
                - successful_runs: Number of runs that produced output
                - avg_similarity: Average pairwise similarity (0.0-1.0)
                - avg_confidence: Average confidence score across all rows
                - total_confidence_scores: Number of confidence scores found
                - is_low_consistency: Whether consistency is below threshold
        """
        runs = runs or self.default_runs
        logger.info(f'Starting consistency check with {runs} runs')

        results = []
        for run_num in range(runs):
            try:
                logger.info(f'Consistency run {run_num + 1}/{runs}')
                results.append(self.boq_extractor.extract(chunks, vector_store))
            except Exception as e:
                # A failed run still occupies a slot so run counts stay honest;
                # empty strings are excluded from similarity comparisons.
                logger.warning(f"Consistency run {run_num + 1} failed: {e}")
                results.append("")

        avg_similarity = self._calculate_similarity(results)
        consistency_score = avg_similarity * 100

        all_confidences = []
        for boq in results:
            all_confidences.extend(self._extract_confidence_scores(boq))

        avg_confidence = sum(all_confidences) / len(all_confidences) if all_confidences else 0
        successful_runs = len([r for r in results if r])

        result = {
            "consistency_score": round(consistency_score, 2),
            "runs": runs,
            "successful_runs": successful_runs,
            "avg_similarity": round(avg_similarity, 2),
            "avg_confidence": round(avg_confidence, 2),
            "total_confidence_scores": len(all_confidences),
            "is_low_consistency": consistency_score < self.low_threshold
        }

        logger.info(f'Consistency check completed: {result}')
        return result
streamlit_app.py
CHANGED
|
@@ -1,143 +1,310 @@
|
|
| 1 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
import tempfile
|
| 3 |
-
import
|
| 4 |
-
import
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
|
| 44 |
try:
|
| 45 |
-
#
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
|
| 50 |
-
#
|
| 51 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
|
| 53 |
# Store in session
|
| 54 |
-
st.session_state.qa_chain = qa_chain
|
| 55 |
-
st.session_state.extracted_boq = extracted_boq
|
| 56 |
st.session_state.chunks = chunks
|
| 57 |
st.session_state.vector_store = vector_store
|
| 58 |
-
st.session_state.
|
| 59 |
-
|
| 60 |
-
st.
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
st.error(f"Error processing PDF: {error_msg}")
|
| 67 |
finally:
|
| 68 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
# Parse and display the extracted BOQ with better formatting
|
| 75 |
-
boq_text = st.session_state.extracted_boq
|
| 76 |
|
| 77 |
-
|
| 78 |
-
|
|
|
|
| 79 |
|
| 80 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
|
| 82 |
-
#
|
| 83 |
-
st.
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
)
|
| 89 |
-
|
| 90 |
-
# Add consistency check button
|
| 91 |
-
if st.button("π Check BOQ Reliability"):
|
| 92 |
-
with st.spinner("Running consistency check..."):
|
| 93 |
try:
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
st.
|
| 98 |
-
st.
|
| 99 |
-
|
| 100 |
-
st.warning("β οΈ Low consistency detected. LLM outputs vary significantlyβconsider reviewing extractions.")
|
| 101 |
except Exception as e:
|
| 102 |
-
|
|
|
|
|
|
|
|
|
|
| 103 |
|
| 104 |
-
st.divider()
|
| 105 |
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 110 |
|
| 111 |
-
# Display previous chat messages
|
| 112 |
-
for message in st.session_state.messages:
|
| 113 |
-
with st.chat_message(message["role"]):
|
| 114 |
-
st.markdown(message["content"])
|
| 115 |
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 122 |
|
| 123 |
-
with st.chat_message("user"):
|
| 124 |
-
st.markdown(prompt)
|
| 125 |
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
|
| 135 |
-
|
| 136 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 137 |
|
| 138 |
-
st.session_state.messages.append(
|
| 139 |
-
{"role": "assistant", "content": answer}
|
| 140 |
-
)
|
| 141 |
|
| 142 |
-
|
| 143 |
-
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
BOQTenders Streamlit Application
|
| 3 |
+
|
| 4 |
+
Interactive web interface for BOQ extraction and document chat.
|
| 5 |
+
|
| 6 |
+
Usage:
|
| 7 |
+
streamlit run streamlit_app_new.py
|
| 8 |
+
"""
|
| 9 |
+
import sys
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
|
| 12 |
+
# Add project root to path
|
| 13 |
+
sys.path.insert(0, str(Path(__file__).parent))
|
| 14 |
+
|
| 15 |
import tempfile
|
| 16 |
+
import streamlit as st
|
| 17 |
+
from loguru import logger
|
| 18 |
+
|
| 19 |
+
from config.settings import settings
|
| 20 |
+
from core.pdf_extractor import PDFExtractor
|
| 21 |
+
from core.embeddings import EmbeddingService
|
| 22 |
+
from core.rag_chain import RAGChainBuilder
|
| 23 |
+
from services.boq_extractor import BOQExtractor
|
| 24 |
+
from services.consistency import ConsistencyChecker
|
| 25 |
+
|
| 26 |
+
# Configure logging
|
| 27 |
+
logger.remove()
|
| 28 |
+
logger.add(
|
| 29 |
+
sys.stderr,
|
| 30 |
+
level=settings.log_level,
|
| 31 |
+
format="<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level: <8}</level> | <cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> - <level>{message}</level>"
|
| 32 |
+
)
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def initialize_services():
    """Create the shared service objects once per session."""
    # Guard clause: skip re-construction on Streamlit reruns.
    if "services_initialized" in st.session_state:
        return

    st.session_state.pdf_extractor = PDFExtractor()
    st.session_state.embedding_service = EmbeddingService()
    st.session_state.rag_builder = RAGChainBuilder()
    st.session_state.boq_extractor = BOQExtractor()
    st.session_state.consistency_checker = ConsistencyChecker()
    st.session_state.services_initialized = True
+
def initialize_session_state():
    """Ensure every session-state key the app uses exists (idempotent)."""
    # The tuple is rebuilt per call, so the [] default is always fresh.
    for key, default in (
        ("boq_output", None),
        ("qa_chain", None),
        ("vector_store", None),
        ("chunks", None),
        ("chat_history", []),
        ("document_loaded", False),
    ):
        if key not in st.session_state:
            st.session_state[key] = default
+
def process_pdf(uploaded_file) -> bool:
    """
    Process an uploaded PDF: extract text, embed, extract BOQ, build QA chain.

    Returns:
        True if processing succeeded, False otherwise.
    """
    try:
        with st.spinner("Processing PDF..."):
            # Persist the upload to disk so the extractor can read a path.
            with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as handle:
                handle.write(uploaded_file.getvalue())
                pdf_path = handle.name

            try:
                st.info("Extracting text from PDF...")
                extracted_text = st.session_state.pdf_extractor.extract_text(pdf_path, filename=uploaded_file.name)

                if not extracted_text:
                    st.error("Could not extract text from PDF")
                    return False

                st.info("Creating embeddings...")
                doc_chunks = st.session_state.embedding_service.split_text(extracted_text)
                store = st.session_state.embedding_service.create_vector_store(doc_chunks)

                st.info("Extracting BOQ items...")
                boq_markdown = st.session_state.boq_extractor.extract(doc_chunks, store)

                st.info("Building chat interface...")
                chain = st.session_state.rag_builder.build(store)

                # Publish results to the session and reset the conversation.
                st.session_state.chunks = doc_chunks
                st.session_state.vector_store = store
                st.session_state.boq_output = boq_markdown
                st.session_state.qa_chain = chain
                st.session_state.document_loaded = True
                st.session_state.chat_history = []

                st.success(f"β Processed {len(doc_chunks)} document chunks")
                return True

            finally:
                # Always remove the temp file, even on failure.
                Path(pdf_path).unlink(missing_ok=True)

    except Exception as e:
        logger.error(f"Error processing PDF: {e}")
        st.error(f"Error processing PDF: {str(e)}")
        return False
| 118 |
+
def render_chat_interface():
    """Render the document chat: history, input box, and QA-chain responses."""
    st.subheader("π¬ Chat with Document")

    if not st.session_state.document_loaded:
        st.info("Please upload a PDF to enable chat")
        return

    # Replay prior turns.
    for entry in st.session_state.chat_history:
        speaker = "user" if entry["role"] == "user" else "assistant"
        st.chat_message(speaker).write(entry["content"])

    # New question from the user.
    question = st.chat_input("Ask a question about the document...")
    if not question:
        return

    st.session_state.chat_history.append({"role": "user", "content": question})
    st.chat_message("user").write(question)

    with st.spinner("Thinking..."):
        try:
            response = st.session_state.qa_chain({"question": question})
            answer = response.get("answer", "I couldn't find an answer.")

            st.session_state.chat_history.append({"role": "assistant", "content": answer})
            st.chat_message("assistant").write(answer)

        except Exception as e:
            logger.error(f"Chat error: {e}")
            error_msg = f"Error: {str(e)}"
            st.session_state.chat_history.append({"role": "assistant", "content": error_msg})
            st.chat_message("assistant").write(error_msg)
|
| 156 |
+
def render_boq_output():
    """Show the extracted BOQ with a download button, or a hint when empty."""
    st.subheader("π Extracted BOQ")

    boq = st.session_state.boq_output
    if not boq:
        st.info("Upload a PDF to see extracted BOQ items")
        return

    st.markdown(boq)

    # Offer the raw markdown as a download.
    st.download_button(
        label="π₯ Download BOQ as Markdown",
        data=boq,
        file_name="boq_output.md",
        mime="text/markdown"
    )
|
| 174 |
+
def render_consistency_check():
    """Let the user run repeated extractions and view agreement metrics."""
    st.subheader("π Consistency Check")

    if not st.session_state.document_loaded:
        st.info("Upload a PDF to run consistency checks")
        return

    runs = st.number_input(
        "Number of extraction runs",
        min_value=2,
        max_value=10,
        value=settings.consistency.default_runs,
        step=1
    )

    if st.button("Run Consistency Check"):
        with st.spinner(f"Running {runs} extraction passes..."):
            try:
                metrics = st.session_state.consistency_checker.check(
                    chunks=st.session_state.chunks,
                    vector_store=st.session_state.vector_store,
                    runs=runs
                )

                score_col, conf_col, runs_col = st.columns(3)
                score_col.metric("Consistency Score", f"{metrics['consistency_score']:.1f}%")
                conf_col.metric("Avg Confidence", f"{metrics['avg_confidence']:.1f}%")
                runs_col.metric("Successful Runs", f"{metrics['successful_runs']}/{metrics['runs']}")

                if metrics['is_low_consistency']:
                    st.warning("β οΈ Low consistency detected. Results may vary.")
                else:
                    st.success("β Good consistency across extraction runs")

            except Exception as e:
                logger.error(f"Consistency check error: {e}")
                st.error(f"Error: {str(e)}")
|
| 220 |
+
def render_sidebar():
    """Draw the sidebar: document upload control and session reset."""
    with st.sidebar:
        st.title("π BOQ Extractor")
        st.markdown("---")

        uploaded = st.file_uploader(
            "Upload PDF Document",
            type=["pdf"],
            help="Upload a tender/BOQ document for extraction"
        )

        # Short-circuit: the button is only rendered once a file is chosen.
        if uploaded and st.button("π Process Document"):
            process_pdf(uploaded)

        st.markdown("---")

        if st.button("ποΈ Clear Session"):
            # Drop everything except the service singletons, then re-seed.
            for key in list(st.session_state.keys()):
                if key != "services_initialized":
                    del st.session_state[key]
            initialize_session_state()
            st.success("Session cleared!")
            st.rerun()
|
| 249 |
+
def main():
    """Application entry point: configure the page, init state, render UI."""
    st.set_page_config(
        page_title=settings.streamlit.page_title,
        page_icon=settings.streamlit.page_icon,
        layout=settings.streamlit.layout,
        initial_sidebar_state="expanded"
    )

    # Sticky-tabs styling, including dark-mode variants.
    st.markdown("""
    <style>
    /* Make tabs sticky at top */
    .stTabs [data-baseweb="tab-list"] {
        position: sticky;
        top: 0;
        background-color: white;
        z-index: 999;
        padding-top: 1rem;
        padding-bottom: 0.5rem;
        border-bottom: 1px solid #e6e6e6;
    }

    /* Dark mode support */
    @media (prefers-color-scheme: dark) {
        .stTabs [data-baseweb="tab-list"] {
            background-color: #0e1117;
            border-bottom: 1px solid #333;
        }
    }

    /* Streamlit dark theme */
    [data-theme="dark"] .stTabs [data-baseweb="tab-list"] {
        background-color: #0e1117;
        border-bottom: 1px solid #333;
    }
    </style>
    """, unsafe_allow_html=True)

    initialize_services()
    initialize_session_state()

    render_sidebar()

    # Main content area, split into three tabs.
    boq_tab, chat_tab, analysis_tab = st.tabs(["π BOQ Output", "π¬ Chat", "π Analysis"])

    with boq_tab:
        render_boq_output()

    with chat_tab:
        render_chat_interface()

    with analysis_tab:
        render_consistency_check()


if __name__ == "__main__":
    main()