Sahil Garg committed on
Commit
67dfd9a
Β·
1 Parent(s): 6968c5c

modularization and configuration

Browse files
.env.example ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ # BOQTenders Environment Configuration
2
+ # Copy this file to .env and fill in your API keys
3
+
4
+ # Required API Keys
5
+ GOOGLE_API_KEY=your_google_api_key_here
6
+ HF_API_TOKEN=your_huggingface_api_token_here
.gitignore CHANGED
@@ -1,4 +1,5 @@
1
  __pycache__/
 
2
  *.pyc
3
  *.pyo
4
  *.pyd
 
1
  __pycache__/
2
+ **/__pycache__/
3
  *.pyc
4
  *.pyo
5
  *.pyd
README.md CHANGED
Binary files a/README.md and b/README.md differ
 
api/routes.py ADDED
@@ -0,0 +1,252 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ FastAPI routes for BOQ extraction API.
3
+ """
4
+ import tempfile
5
+ from typing import Optional
6
+ from pathlib import Path
7
+
8
+ from fastapi import FastAPI, File, UploadFile, HTTPException
9
+ from fastapi.middleware.cors import CORSMiddleware
10
+ from loguru import logger
11
+
12
+ from config.settings import settings
13
+ from core.pdf_extractor import PDFExtractor
14
+ from core.embeddings import EmbeddingService
15
+ from core.rag_chain import RAGChainBuilder
16
+ from services.boq_extractor import BOQExtractor
17
+ from services.consistency import ConsistencyChecker
18
+ from api.schemas import (
19
+ ChatRequest,
20
+ ChatResponse,
21
+ UploadResponse,
22
+ ConsistencyResponse,
23
+ ErrorResponse,
24
+ )
25
+
26
+
27
# Initialize FastAPI app; metadata and docs exposure come from settings.
app = FastAPI(
    title=settings.api.title,
    description=settings.api.description,
    version=settings.api.version,
    # Passing None disables the interactive docs endpoints entirely.
    docs_url="/docs" if settings.api.docs_enabled else None,
    redoc_url="/redoc" if settings.api.docs_enabled else None,
)

# Configure CORS (allowed origins are configurable; methods/headers are open)
app.add_middleware(
    CORSMiddleware,
    allow_origins=settings.api.cors_origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Global state for session data, populated by /upload and read by
# /chat and /consistency.
# NOTE(review): a single module-level dict means one shared "session" per
# process — concurrent users overwrite each other's document. Confirm this
# single-tenant assumption is intended.
_session_state = {
    "qa_chain": None,
    "vector_store": None,
    "chunks": None,
}

# Initialize services once at import time; reused across requests.
pdf_extractor = PDFExtractor()
embedding_service = EmbeddingService()
rag_builder = RAGChainBuilder()
boq_extractor = BOQExtractor()
consistency_checker = ConsistencyChecker(boq_extractor=boq_extractor)

# Expose router for external use (e.g. mounting under another app)
router = app.router
61
+
62
+
63
@app.post(
    "/upload",
    response_model=UploadResponse,
    responses={400: {"model": ErrorResponse}, 500: {"model": ErrorResponse}},
    tags=["Documents"]
)
async def upload_pdf(file: UploadFile = File(...)):
    """
    Upload a PDF file for BOQ extraction.

    - Extracts text from PDF
    - Creates embeddings and vector store
    - Extracts BOQ items
    - Sets up QA chain for chat

    Raises:
        HTTPException: 400 for a missing/non-PDF file or empty extraction;
            500 for any unexpected processing failure.
    """
    global _session_state

    if not file:
        raise HTTPException(status_code=400, detail="No file uploaded")

    # UploadFile.filename is Optional — a client can omit it, which would make
    # .lower() raise AttributeError and surface as a 500. Guard for None first.
    if not file.filename or not file.filename.lower().endswith('.pdf'):
        raise HTTPException(status_code=400, detail="Only PDF files are supported")

    try:
        logger.info(f'Processing uploaded file: {file.filename}')

        # Persist the upload to a temp file so the extractor can read a path
        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
            content = await file.read()
            temp_file.write(content)
            temp_path = temp_file.name

        try:
            # Extract text from PDF
            logger.info('Extracting text from PDF...')
            text = pdf_extractor.extract_text(temp_path, filename=file.filename)

            if not text:
                raise HTTPException(
                    status_code=400,
                    detail="Could not extract text from PDF"
                )

            # Create chunks and vector store
            logger.info('Creating embeddings...')
            chunks = embedding_service.split_text(text)
            vector_store = embedding_service.create_vector_store(chunks)

            # Extract BOQ
            logger.info('Extracting BOQ...')
            boq_output = boq_extractor.extract(chunks, vector_store)

            # Build QA chain
            logger.info('Building QA chain...')
            qa_chain = rag_builder.build(vector_store)

            # Store in session state so /chat and /consistency can reuse them
            _session_state["qa_chain"] = qa_chain
            _session_state["vector_store"] = vector_store
            _session_state["chunks"] = chunks

            logger.info(f'Upload completed: {len(chunks)} chunks created')

            return UploadResponse(
                message="success",
                output=boq_output
            )

        finally:
            # Always clean up the temp file, even when processing failed
            Path(temp_path).unlink(missing_ok=True)

    except HTTPException:
        # Re-raise deliberate HTTP errors unchanged (don't wrap them as 500)
        raise
    except Exception as e:
        logger.error(f'Error processing upload: {e}')
        raise HTTPException(status_code=500, detail=str(e))
140
+
141
+
142
@app.post(
    "/chat",
    response_model=ChatResponse,
    responses={400: {"model": ErrorResponse}, 500: {"model": ErrorResponse}},
    tags=["Chat"]
)
async def chat(request: ChatRequest):
    """
    Ask a question about the uploaded document.

    Requires a document to be uploaded first via /upload endpoint.
    """
    # Reject early when no document has been processed yet
    chain = _session_state.get("qa_chain")
    if not chain:
        raise HTTPException(
            status_code=400,
            detail="No document loaded. Please upload a PDF first."
        )

    try:
        logger.info(f'Processing chat question: {request.question[:50]}...')

        # Invoke the conversational chain (legacy LangChain call style)
        result = chain({"question": request.question})

        logger.info('Chat response generated')
        return ChatResponse(answer=result.get("answer", ""))

    except Exception as e:
        logger.error(f'Error processing chat: {e}')
        raise HTTPException(status_code=500, detail=str(e))
179
+
180
+
181
@app.post(
    "/consistency",
    response_model=ConsistencyResponse,
    responses={400: {"model": ErrorResponse}, 500: {"model": ErrorResponse}},
    tags=["Analysis"]
)
async def check_consistency(runs: int = 4):
    """
    Check extraction consistency by running multiple extractions.

    Requires a document to be uploaded first via /upload endpoint.

    Args:
        runs: Number of extraction runs (default: 4)
    """
    # Guard clauses: a document must be loaded and `runs` must be in range
    if not _session_state.get("chunks"):
        raise HTTPException(
            status_code=400,
            detail="No document loaded. Please upload a PDF first."
        )

    if runs < 2:
        raise HTTPException(
            status_code=400,
            detail="At least 2 runs required for consistency check"
        )

    if runs > 10:
        raise HTTPException(
            status_code=400,
            detail="Maximum 10 runs allowed"
        )

    try:
        logger.info(f'Running consistency check with {runs} runs')

        outcome = consistency_checker.check(
            chunks=_session_state["chunks"],
            vector_store=_session_state["vector_store"],
            runs=runs
        )

        # Map the checker's dict result onto the response schema
        return ConsistencyResponse(
            consistency_score=outcome.get("consistency_score"),
            successful_runs=outcome.get("successful_runs"),
            avg_confidence=outcome.get("avg_confidence"),
            is_low_consistency=outcome.get("is_low_consistency")
        )

    except Exception as e:
        logger.error(f'Error in consistency check: {e}')
        raise HTTPException(status_code=500, detail=str(e))
235
+
236
+
237
@app.get(
    "/clear",
    tags=["Session"]
)
async def clear_session():
    """Clear the current session state."""
    global _session_state

    # Rebind to a fresh dict with every session key reset to None
    _session_state = dict.fromkeys(("qa_chain", "vector_store", "chunks"))

    logger.info('Session cleared')
    return {"message": "Session cleared"}
api/schemas.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Pydantic schemas for API request/response validation.
3
+ """
4
+ from typing import Optional, List, Dict, Any
5
+ from pydantic import BaseModel, Field
6
+
7
+
8
class ChatRequest(BaseModel):
    """Chat endpoint request schema."""
    # Free-text question answered against the currently uploaded document
    question: str = Field(..., description="Question to ask about the document")

    class Config:
        # Example payload surfaced in the OpenAPI docs UI
        json_schema_extra = {
            "example": {
                "question": "What is the total quantity of steel required?"
            }
        }
18
+
19
+
20
class ChatResponse(BaseModel):
    """Chat endpoint response schema."""
    # Answer text produced by the QA chain
    answer: str = Field(..., description="Answer to the question")

    class Config:
        # Example payload surfaced in the OpenAPI docs UI
        json_schema_extra = {
            "example": {
                "answer": "The total quantity of steel required is 500 MT."
            }
        }
30
+
31
+
32
class UploadResponse(BaseModel):
    """Upload endpoint response schema."""
    # Status indicator; "success" on a completed upload
    message: str = Field(..., description="Status message")
    # Full BOQ extraction result rendered as markdown
    output: str = Field(..., description="Extracted BOQ in markdown format")

    class Config:
        # Example payload surfaced in the OpenAPI docs UI
        json_schema_extra = {
            "example": {
                "message": "success",
                "output": "## DOCUMENT SUMMARY\n..."
            }
        }
44
+
45
+
46
class ConsistencyResponse(BaseModel):
    """Consistency check endpoint response schema."""
    # Percentage agreement across extraction runs; validated to 0-100
    consistency_score: float = Field(
        ...,
        ge=0,
        le=100,
        description="Consistency score as percentage (0-100)"
    )
    # How many of the requested runs completed without error
    successful_runs: int = Field(..., description="Number of successful runs")
    # Mean per-item confidence over successful runs; validated to 0-100
    avg_confidence: float = Field(
        ...,
        ge=0,
        le=100,
        description="Average confidence score"
    )
    # Flag set by the checker when the score falls below its threshold
    is_low_consistency: bool = Field(
        ...,
        description="Whether consistency is below threshold"
    )

    class Config:
        # Example payload surfaced in the OpenAPI docs UI
        json_schema_extra = {
            "example": {
                "consistency_score": 92.5,
                "successful_runs": 4,
                "avg_confidence": 85.2,
                "is_low_consistency": False
            }
        }
75
+
76
+
77
class BOQResponse(BaseModel):
    """BOQ extraction response schema."""
    # Full BOQ extraction result rendered as markdown
    boq_output: str = Field(..., description="Extracted BOQ in markdown format")
    # Count of extracted line items; defaults to 0 when not computed
    items_count: int = Field(default=0, description="Number of BOQ items extracted")

    class Config:
        # Example payload surfaced in the OpenAPI docs UI
        json_schema_extra = {
            "example": {
                "boq_output": "## DOCUMENT SUMMARY\n...",
                "items_count": 25
            }
        }
89
+
90
+
91
class ErrorResponse(BaseModel):
    """Error response schema."""
    # Machine-readable error category (e.g. exception class name)
    error: str = Field(..., description="Error type")
    # Human-readable summary of what went wrong
    message: str = Field(..., description="Error message")
    # Optional extra context; omitted (None) when there is nothing to add
    detail: Optional[str] = Field(default=None, description="Additional error details")

    class Config:
        # Example payload surfaced in the OpenAPI docs UI
        json_schema_extra = {
            "example": {
                "error": "ValidationError",
                "message": "No file uploaded",
                "detail": "Please upload a PDF file"
            }
        }
app.py CHANGED
@@ -1,122 +1,39 @@
1
- from fastapi import FastAPI, HTTPException, UploadFile, File
2
- from pydantic import BaseModel
3
- import boq_processor
4
- from loguru import logger
5
- import os
6
- import shutil
7
- from typing import Dict, Any
8
 
9
- app = FastAPI(title="BOQ Chatbot API", description="API for extracting and querying BOQ from tender PDFs using RAG+CAG")
10
 
11
- # Global variables for chain and vector store
12
- vector_store = None
13
- qa_chain = None
14
- chunks = None
 
15
 
16
- class ChatRequest(BaseModel):
17
- question: str
18
 
19
- @app.post("/upload", summary="Upload PDF", description="Upload a PDF file to initialize the BOQ chatbot.")
20
- async def upload_pdf(file: UploadFile = File(...)) -> Dict[str, Any]:
21
- global vector_store, qa_chain
22
- if not file.filename.endswith(".pdf"):
23
- logger.warning(f"Invalid file type uploaded: {file.filename}")
24
- raise HTTPException(status_code=400, detail="Only PDF files are allowed")
25
-
26
- # Save uploaded file
27
- pdf_path = f"temp_{file.filename}"
28
- try:
29
- with open(pdf_path, "wb") as buffer:
30
- shutil.copyfileobj(file.file, buffer)
31
-
32
- logger.info(f"Processing uploaded PDF: {file.filename}")
33
- global chunks
34
- chunks = boq_processor.load_and_process_pdf(pdf_path, filename=file.filename)
35
- vector_store = boq_processor.create_vector_store(chunks)
36
- qa_chain = boq_processor.setup_rag_chain(vector_store)
37
-
38
- # Use comprehensive extraction for complete BOQ coverage
39
- extracted_boq = boq_processor.extract_boq_comprehensive(chunks, vector_store)
40
-
41
- logger.info("PDF uploaded and processed successfully")
42
-
43
- # Return enriched response with processing metadata
44
- return {
45
- "status": "success",
46
- "message": "PDF uploaded and processed successfully",
47
- "file_name": file.filename,
48
- "processing_info": {
49
- "documents_loaded": len(chunks),
50
- "chunks_created": len(chunks),
51
- "vector_store_ready": vector_store is not None,
52
- "rag_chain_ready": qa_chain is not None
53
- },
54
- "extracted_boq": extracted_boq
55
- }
56
- except Exception as e:
57
- error_msg = str(e)
58
- logger.error(f"Error processing PDF: {error_msg}")
59
-
60
- # Check if it's a rate limit error and provide specific guidance
61
- if "429" in error_msg or "RESOURCE_EXHAUSTED" in error_msg or "quota" in error_msg.lower():
62
- raise HTTPException(
63
- status_code=429,
64
- detail="API rate limit exceeded. Please try again later or upgrade your API plan. See https://ai.google.dev/gemini-api/docs/rate-limits"
65
- )
66
- else:
67
- raise HTTPException(status_code=500, detail="Internal server error")
68
- finally:
69
- if os.path.exists(pdf_path):
70
- os.remove(pdf_path)
71
 
72
- @app.post("/chat", summary="Chat with BOQ", description="Send a question about the uploaded BOQ PDF and get an answer.")
73
- async def chat(request: ChatRequest) -> Dict[str, str]:
74
- if not qa_chain:
75
- logger.warning("Chat attempted without uploaded PDF")
76
- raise HTTPException(status_code=400, detail="No PDF uploaded. Please upload a PDF first using /upload")
77
-
78
- try:
79
- logger.info(f"Processing chat question: {request.question}")
80
-
81
- # Use old LangChain API (0.1.x) directly
82
- result = qa_chain({"question": request.question})
83
-
84
- logger.info("Chat response generated")
85
- return {"answer": result["answer"]}
86
- except Exception as e:
87
- error_msg = str(e)
88
- logger.error(f"Error in chat: {error_msg}")
89
-
90
- # Check if it's a rate limit error and provide specific guidance
91
- if "429" in error_msg or "RESOURCE_EXHAUSTED" in error_msg or "quota" in error_msg.lower():
92
- raise HTTPException(
93
- status_code=429,
94
- detail="API rate limit exceeded. Please try again later or upgrade your API plan. See https://ai.google.dev/gemini-api/docs/rate-limits"
95
- )
96
- else:
97
- raise HTTPException(status_code=500, detail="Internal server error")
98
 
99
- @app.get("/consistency", summary="Check Consistency", description="Run multiple BOQ extractions to check LLM output consistency.")
100
- async def get_consistency() -> Dict[str, Any]:
101
- global chunks, vector_store
102
- if not chunks or not vector_store:
103
- logger.warning("Consistency check attempted without uploaded PDF")
104
- raise HTTPException(status_code=400, detail="No PDF uploaded. Please upload a PDF first using /upload")
105
 
106
- try:
107
- logger.info("Running consistency check")
108
- consistency_result = boq_processor.check_consistency(chunks, vector_store, runs=4)
109
- logger.info(f"Consistency check completed: {consistency_result}")
110
- return {"status": "success", "consistency": consistency_result}
111
- except Exception as e:
112
- error_msg = str(e)
113
- logger.error(f"Error in consistency check: {error_msg}")
114
-
115
- # Check if it's a rate limit error
116
- if "429" in error_msg or "RESOURCE_EXHAUSTED" in error_msg or "quota" in error_msg.lower():
117
- raise HTTPException(
118
- status_code=429,
119
- detail="API rate limit exceeded. Please try again later or upgrade your API plan. See https://ai.google.dev/gemini-api/docs/rate-limits"
120
- )
121
- else:
122
- raise HTTPException(status_code=500, detail="Internal server error")
 
1
"""
BOQTenders API Server

FastAPI application entry point for BOQ extraction and chat services.

Usage:
    uvicorn app:app --host 0.0.0.0 --port 8000 --reload
"""
import sys
from pathlib import Path

# Add project root to path so `config`, `api`, etc. resolve as top-level
# packages regardless of the directory uvicorn is launched from.
# This must run BEFORE the project imports below.
sys.path.insert(0, str(Path(__file__).parent))

from loguru import logger
from config.settings import settings
from api.routes import app

# Configure logging: drop loguru's default sink and install one whose
# level is driven by settings.
logger.remove()
logger.add(
    sys.stderr,
    level=settings.log_level,
    format="<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level: <8}</level> | <cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> - <level>{message}</level>"
)

# Export app for uvicorn
__all__ = ["app"]


if __name__ == "__main__":
    import uvicorn

    # Dev convenience: `python app.py` serves directly; auto-reload
    # follows the debug flag in settings.
    uvicorn.run(
        "app:app",
        host=settings.api.host,
        port=settings.api.port,
        reload=settings.api.debug,
    )
 
 
 
 
 
 
 
 
 
 
 
boq_processor.py DELETED
@@ -1,477 +0,0 @@
1
- import os
2
- from typing import List, Optional
3
- import dotenv
4
- from pydantic_settings import BaseSettings
5
- from loguru import logger
6
- from langchain_text_splitters import RecursiveCharacterTextSplitter
7
- from langchain_huggingface import HuggingFaceEmbeddings
8
- from langchain_community.vectorstores import FAISS
9
- from langchain_google_genai import GoogleGenerativeAI
10
- import time
11
- from functools import wraps
12
- import re
13
- import requests
14
-
15
- from langchain_core.documents import Document
16
-
17
- # Load environment variables
18
- dotenv.load_dotenv()
19
-
20
- def retry_with_exponential_backoff(max_retries: int = 3, initial_delay: int = 2, backoff_factor: int = 2):
21
- """Decorator for retry logic with exponential backoff for API rate limits."""
22
- def decorator(func):
23
- @wraps(func)
24
- def wrapper(*args, **kwargs):
25
- last_exception = None
26
- for attempt in range(max_retries + 1):
27
- try:
28
- return func(*args, **kwargs)
29
- except Exception as e:
30
- last_exception = e
31
- error_str = str(e)
32
- is_rate_limit = ("429" in error_str or "RESOURCE_EXHAUSTED" in error_str or "quota" in error_str.lower())
33
- if is_rate_limit and attempt < max_retries:
34
- delay = initial_delay * (backoff_factor ** attempt)
35
- logger.warning(f"Rate limit encountered. Retry {attempt + 1}/{max_retries} in {delay}s: {error_str}")
36
- time.sleep(delay)
37
- else:
38
- if is_rate_limit and attempt == max_retries:
39
- logger.error(f"Rate limit exhausted after {max_retries} retries")
40
- raise
41
- raise last_exception
42
- return wrapper
43
- return decorator
44
-
45
- class Settings(BaseSettings):
46
- google_api_key: str = os.getenv("GOOGLE_API_KEY")
47
- hf_api_token: str = os.getenv("HF_API_TOKEN")
48
- model_name: str = "gemini-2.5-flash-lite"
49
- temperature: float = 0.0
50
- chunk_size: int = 1000
51
- chunk_overlap: int = 500
52
- embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2"
53
-
54
- settings = Settings()
55
-
56
- class BOQProcessor:
57
- def __init__(self):
58
- self.settings = settings
59
- self.llm = GoogleGenerativeAI(model=self.settings.model_name, temperature=self.settings.temperature)
60
- self.text_splitter = RecursiveCharacterTextSplitter(
61
- chunk_size=self.settings.chunk_size,
62
- chunk_overlap=self.settings.chunk_overlap
63
- )
64
-
65
- def _table_to_markdown(self, table: dict) -> str:
66
- # Commented out: Table extraction not needed for now
67
- """
68
- headers = table.get('headers', [])
69
- rows = table.get('rows', [])
70
- if not headers:
71
- return ''
72
- md = '| ' + ' | '.join(headers) + ' |\n'
73
- md += '|' + '|'.join(['---'] * len(headers)) + '|\n'
74
- for row in rows:
75
- md += '| ' + ' | '.join(str(cell) for cell in row) + ' |\n'
76
- return md
77
- """
78
-
79
- def _call_extract_text_api(self, pdf_path: str, start_page: int = 1, end_page: int = 100, filename: str = None) -> str:
80
- display_name = filename or os.path.basename(pdf_path)
81
- logger.info(f'Starting text extraction for {display_name} (pages {start_page}-{end_page})')
82
- with open(pdf_path, 'rb') as f:
83
- files = {'file': (os.path.basename(pdf_path), f, 'application/pdf')}
84
- data = {'start_page': start_page, 'end_page': end_page, 'filename': os.path.basename(pdf_path)}
85
- headers = {'Authorization': f'Bearer {self.settings.hf_api_token}'}
86
- response = requests.post(
87
- 'https://point9-extract-text-and-table.hf.space/api/text',
88
- files=files,
89
- data=data,
90
- headers=headers
91
- )
92
- response.raise_for_status()
93
- json_response = response.json()
94
- if isinstance(json_response, dict):
95
- result = json_response.get('result', '')
96
- else:
97
- logger.error(f"Unexpected response format: {json_response}")
98
- result = ''
99
- logger.info(f'Text extraction completed, response length: {len(result)}')
100
- return result
101
-
102
- def _call_extract_tables_api(self, pdf_path: str, start_page: int = 1, end_page: int = 2, filename: str = None) -> List[dict]:
103
- # Commented out: Table extraction not needed for now
104
- """
105
- display_name = filename or os.path.basename(pdf_path)
106
- logger.info(f'Starting table extraction for {display_name} (pages {start_page}-{end_page})')
107
- with open(pdf_path, 'rb') as f:
108
- files = {'file': (os.path.basename(pdf_path), f, 'application/pdf')}
109
- data = {'start_page': start_page, 'end_page': end_page, 'filename': os.path.basename(pdf_path)}
110
- headers = {'Authorization': f'Bearer {self.settings.hf_api_token}'}
111
- response = requests.post(
112
- 'https://point9-extract-text-and-table.hf.space/api/tables',
113
- files=files,
114
- data=data,
115
- headers=headers
116
- )
117
- response.raise_for_status()
118
- json_response = response.json()
119
- if isinstance(json_response, dict):
120
- result = json_response.get('result', [])
121
- # Filter to only include valid table dicts
122
- valid_tables = [t for t in result if isinstance(t, dict)]
123
- invalid_count = len(result) - len(valid_tables)
124
- if invalid_count > 0:
125
- logger.warning(f"Filtered out {invalid_count} invalid tables (not dicts)")
126
- result = valid_tables
127
- else:
128
- logger.error(f"Unexpected response format: {json_response}")
129
- result = []
130
- logger.info(f'Table extraction completed, found {len(result)} valid tables')
131
- return result
132
- """
133
-
134
- def load_and_process_pdf(self, pdf_path: str, filename: str = None) -> List[Document]:
135
- try:
136
- display_name = filename or os.path.basename(pdf_path)
137
- logger.info(f'Processing PDF from {display_name} using Hugging Face API')
138
- logger.info('Calling text extraction API...')
139
- extracted_text = self._call_extract_text_api(pdf_path, filename=filename)
140
- logger.info(f'Extracted text length: {len(extracted_text)}')
141
- if extracted_text:
142
- logger.info(f'Text preview: {extracted_text[:200]}...')
143
- else:
144
- logger.warning('Extracted text is empty')
145
- # Commented out: Table extraction not needed for now
146
- """
147
- logger.info('Calling table extraction API...')
148
- tables = self._call_extract_tables_api(pdf_path, filename=filename)
149
- logger.info(f'Extracted {len(tables)} tables')
150
- logger.info('Converting tables to markdown...')
151
- table_texts = [self._table_to_markdown(table) for table in tables]
152
- logger.info(f'Converted {len(table_texts)} tables to markdown')
153
- full_content = extracted_text + '\n\n' + '\n\n'.join(table_texts)
154
- """
155
- full_content = extracted_text
156
- logger.info(f'Combined content length: {len(full_content)}')
157
- logger.info('Splitting content into chunks...')
158
- chunks = self.text_splitter.create_documents([full_content])
159
- logger.info(f'Split into {len(chunks)} chunks')
160
- return chunks
161
- except Exception as e:
162
- logger.error(f'Error loading and processing PDF: {e}')
163
- raise
164
-
165
- def create_vector_store(self, chunks: List[Document]) -> FAISS:
166
- try:
167
- logger.info('Creating embeddings and vector store')
168
- logger.info(f'Processing {len(chunks)} chunks for embeddings')
169
- embeddings = HuggingFaceEmbeddings(model_name=self.settings.embedding_model)
170
- logger.info('Embeddings model loaded, creating FAISS vector store...')
171
- vector_store = FAISS.from_documents(chunks, embeddings)
172
- logger.info('Vector store created successfully')
173
- return vector_store
174
- except Exception as e:
175
- logger.error(f'Error creating vector store: {e}')
176
- raise
177
-
178
- def setup_rag_chain(self, vector_store: FAISS):
179
- logger.info('Setting up RAG chain with LangChain classic API (0.1.x)')
180
- from langchain_classic.chains import ConversationalRetrievalChain
181
- from langchain_classic.memory import ConversationBufferMemory
182
- from langchain_core.prompts import PromptTemplate
183
- memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
184
- qa_template = '''You are an expert assistant specializing in construction and tender documents, with deep knowledge of Bill of Quantities (BOQ) analysis. Your role is to provide accurate, helpful, and professional responses based solely on the provided context.
185
-
186
- Guidelines:
187
- - Always base your answers on the given context. Do not use external knowledge or assumptions.
188
- - For BOQ-related questions, provide detailed, structured information including item codes, descriptions, quantities, units, rates, and amounts where available.
189
- - If the context lacks specific information, respond with: "The requested information is not available in the provided document context."
190
- - Be concise yet comprehensive. Structure responses clearly (e.g., use bullet points or tables for lists).
191
- - Handle follow-up questions by referencing previous context in the conversation history.
192
- - Maintain neutrality and professionalism in all responses.
193
-
194
- {context}
195
-
196
- Question: {question}
197
- Answer:'''
198
- qa_prompt = PromptTemplate.from_template(qa_template)
199
- qa_chain = ConversationalRetrievalChain.from_llm(
200
- llm=self.llm,
201
- retriever=vector_store.as_retriever(),
202
- memory=memory,
203
- combine_docs_chain_kwargs={'prompt': qa_prompt},
204
- )
205
- logger.info('RAG chain set up successfully with LangChain classic API')
206
- return qa_chain
207
-
208
- def _extract_metadata(self, chunks: List[Document]) -> str:
209
- metadata_text = '\n\n'.join([chunk.page_content for chunk in chunks[:3]])
210
- metadata_prompt = f'''Extract key information from this tender document excerpt in a concise format:
211
- - Document Type
212
- - Project Name
213
- - Issuing Authority
214
- - Tender Number
215
- - Date
216
- - Location
217
-
218
- Document excerpt:
219
- {metadata_text[:2000]}
220
-
221
- Output only the facts, no extra analysis.'''
222
- logger.info('Invoking LLM for metadata extraction...')
223
- result = str(self.llm.invoke(metadata_prompt))
224
- logger.info('Metadata extraction completed')
225
- return result
226
-
227
- def _batch_chunks(self, chunks: List[Document], batch_size: int = 24) -> List[List[Document]]:
228
- return [chunks[i:i + batch_size] for i in range(0, len(chunks), batch_size)]
229
-
230
- def _extract_boq_from_batch(self, batch_text: str, batch_chunks: List[Document], batch_num: int) -> List[str]:
231
- extraction_prompt = '''Analyze this text and extract ONLY Bill of Quantities (BOQ) line items if present.
232
-
233
- Look for structured data with:
234
- - Item numbers or codes
235
- - Descriptions of work/materials (extract the complete, full description as it appears in the document, without truncation)
236
- - Quantities
237
- - Units (Nos, Sqm, Cum, m, etc.)
238
- - Rates/Unit prices
239
- - Total amounts
240
-
241
- If you find BOQ items, return them in this EXACT format (pipe-separated):
242
- ITEM_CODE|DESCRIPTION|QUANTITY|UNIT|RATE|AMOUNT|CONFIDENCE
243
-
244
- Where:
245
- - CONFIDENCE is a score (0-100%) based on how clearly and completely the data appears in the text. Use lower scores (e.g., 70-90%) if information is partially missing, inferred, or unclear. Use 100% only for complete, directly stated data.
246
-
247
- Rules for columns:
248
- - If an entire column has no values, omit that column.
249
- - For missing values, use "NA".
250
-
251
- Return multiple items on separate lines. If NO BOQ items are found, return: "NO_BOQ_ITEMS"
252
-
253
- Text to analyze:
254
- {batch_text}
255
-
256
- Extract only actual BOQ line items.'''
257
-
258
- prompt_text = batch_text[:30000]
259
- prompt = extraction_prompt.format(batch_text=prompt_text)
260
-
261
- try:
262
- logger.info(f'Invoking LLM for BOQ extraction on batch {batch_num}...')
263
- result = self.llm.invoke(prompt)
264
- logger.info(f'LLM response received for batch {batch_num}')
265
- if 'NO_BOQ_ITEMS' in str(result):
266
- logger.info(f'No BOQ items found in batch {batch_num}')
267
- return []
268
- boq_items = []
269
- lines = str(result).strip().split('\n')
270
- for line in lines:
271
- line = line.strip()
272
- if not line or '|' not in line:
273
- continue
274
- parts = [p.strip() for p in line.split('|')]
275
- if len(parts) < 7:
276
- parts += ['NA'] * (7 - len(parts))
277
- # Temp source
278
- parts.append("Unknown")
279
- # Per-item page detection
280
- desc = parts[1]
281
- search_str = desc[:30].strip().lower()
282
- batch_text_lower = batch_text.lower()
283
- pos = batch_text_lower.rfind(search_str)
284
- if pos != -1:
285
- marker_pattern = r"(?i)(?:---\s*)?page\s+(\d+)(?:\s*---)?"
286
- matches = list(re.finditer(marker_pattern, batch_text[:pos]))
287
- if matches:
288
- page = matches[-1].group(1)
289
- parts[7] = f"Page {page}"
290
- boq_items.append('|'.join(parts[:8]))
291
- logger.info(f'Extracted {len(boq_items)} BOQ items from batch {batch_num}')
292
- return boq_items
293
- except Exception as e:
294
- logger.warning(f'Error processing batch {batch_num}: {e}')
295
- return []
296
-
297
- def _format_boq_output(self, unique_items: List[str], metadata_result: str) -> str:
298
- logger.info('Formatting BOQ output...')
299
- if not unique_items:
300
- logger.info('No BOQ items to format')
301
- return f'''## DOCUMENT SUMMARY
302
- {metadata_result}
303
-
304
- ## DETAILED BILL OF QUANTITIES
305
- No BOQ items were found in this document.'''
306
-
307
- col_headers = ['Item No/Code', 'Description', 'Quantity', 'Unit', 'Rate', 'Amount', 'Confidence Score', 'Source']
308
- cols_present = [False] * 8
309
- normalized_items = []
310
- for item in unique_items:
311
- parts = [p.strip() for p in item.split('|')]
312
- if len(parts) < 8:
313
- parts += ['NA'] * (8 - len(parts))
314
- normalized_items.append(parts[:8])
315
- for i in range(8):
316
- if parts[i] and parts[i].upper() != 'NA':
317
- cols_present[i] = True
318
-
319
- col_indices = [i for i, present in enumerate(cols_present) if present]
320
- if 0 not in col_indices:
321
- col_indices.insert(0, 0)
322
- if 1 not in col_indices:
323
- col_indices.insert(1, 1)
324
-
325
- header_row = '| ' + ' | '.join([col_headers[i] for i in col_indices]) + ' |\n'
326
- sep_row = '|' + '|'.join(['-' * (len(col_headers[i]) + 2) for i in col_indices]) + '|\n'
327
-
328
- formatted_boq = f'''## DOCUMENT SUMMARY
329
- {metadata_result}
330
-
331
- ## DETAILED BILL OF QUANTITIES
332
- **Total Items Found:** {len(unique_items)}
333
-
334
- {header_row}{sep_row}'''
335
-
336
- for parts in normalized_items:
337
- # parts[1] remains full for complete descriptions
338
- parts[7] = parts[7][:50] if len(parts[7]) > 50 else parts[7] # Truncate source if too long
339
- row_vals = [parts[i] for i in col_indices]
340
- # Add % to confidence score if present
341
- if 6 in col_indices:
342
- conf_idx = col_indices.index(6)
343
- if row_vals[conf_idx] != 'NA':
344
- row_vals[conf_idx] = row_vals[conf_idx].rstrip('%') + '%'
345
- # Align confidence with source: if source unknown, set confidence to N/A
346
- if parts[7] == "Unknown":
347
- row_vals[conf_idx] = "N/A"
348
- formatted_boq += '| ' + ' | '.join(row_vals) + ' |\n'
349
-
350
- # formatted_boq += f'\n## SUMMARY\n- **Total Items:** {len(unique_items)}\n'
351
-
352
- try:
353
- s = formatted_boq.replace('\r\n', '\n').replace('\r', '\n')
354
- lines = [ln.lstrip() for ln in s.split('\n')]
355
- header_idx = next((i for i, ln in enumerate(lines) if ln.startswith('| ')), None)
356
- if header_idx and header_idx > 0 and lines[header_idx - 1].strip():
357
- lines.insert(header_idx, '')
358
- if header_idx:
359
- sep_idx = header_idx + 1
360
- if not (sep_idx < len(lines) and re.match(r'^\|\s*-+', lines[sep_idx])):
361
- cols = [c for c in lines[header_idx].split('|') if c.strip()]
362
- sep = '|' + '|'.join(['---' for _ in cols]) + '|'
363
- lines.insert(sep_idx, sep)
364
- formatted_boq = '\n'.join(lines).strip() + '\n\n'
365
- except Exception:
366
- pass
367
-
368
- return formatted_boq
369
-
370
- @retry_with_exponential_backoff(max_retries=3, initial_delay=2)
371
- def extract_boq_comprehensive(self, chunks: List[Document], vector_store: FAISS = None) -> str:
372
- try:
373
- logger.info(f'Starting comprehensive BOQ extraction from {len(chunks)} chunks')
374
- logger.info('Extracting document metadata...')
375
- metadata_result = self._extract_metadata(chunks)
376
- logger.info('Metadata extracted, creating batches...')
377
- batches = self._batch_chunks(chunks)
378
- logger.info(f'Created {len(batches)} batches')
379
- boq_items = []
380
- for batch_num, batch_chunks in enumerate(batches, 1):
381
- logger.info(f'Processing batch {batch_num}/{len(batches)} ({len(batch_chunks)} chunks)')
382
- chunk_texts = [chunk.page_content for chunk in batch_chunks]
383
- batch_text = '\n\n'.join(chunk_texts)
384
- logger.info(f'Batch text length: {len(batch_text)}')
385
- batch_items = self._extract_boq_from_batch(batch_text, batch_chunks, batch_num)
386
- boq_items.extend(batch_items)
387
- logger.info(f'Batch {batch_num} yielded {len(batch_items)} items')
388
- unique_items = list(dict.fromkeys(boq_items))
389
- logger.info(f'Found {len(unique_items)} unique BOQ items after deduplication')
390
- logger.info('Formatting BOQ output...')
391
- formatted_boq = self._format_boq_output(unique_items, metadata_result)
392
- logger.info('Comprehensive BOQ extraction completed successfully')
393
- return formatted_boq
394
- except Exception as e:
395
- logger.error(f'Error in comprehensive BOQ extraction: {e}')
396
- raise
397
-
398
- def check_consistency(chunks: List[Document], vector_store: FAISS, runs: int = 4) -> dict:
399
- """Run extraction multiple times and compute variance."""
400
- from difflib import SequenceMatcher
401
-
402
- results = []
403
- for _ in range(runs):
404
- try:
405
- boq = extract_boq_comprehensive(chunks, vector_store)
406
- results.append(boq)
407
- except Exception as e:
408
- logger.warning(f"Consistency run failed: {e}")
409
- results.append("")
410
-
411
- # Variance: Average similarity between pairs
412
- similarities = []
413
- for i in range(len(results)):
414
- for j in range(i+1, len(results)):
415
- if results[i] and results[j]:
416
- sim = SequenceMatcher(None, results[i], results[j]).ratio()
417
- similarities.append(sim)
418
-
419
- avg_similarity = sum(similarities) / len(similarities) if similarities else 0
420
- consistency_score = avg_similarity * 100
421
-
422
- # Average confidence from per-item scores
423
- all_confidences = []
424
- for boq in results:
425
- if boq:
426
- lines = boq.split('\n')
427
- confidence_idx = None
428
- for line in lines:
429
- line = line.strip()
430
- if '|' in line and 'Confidence' in line and not line.startswith('| ---'):
431
- # Header row: find index of Confidence
432
- parts = [p.strip() for p in line.split('|')[1:-1]]
433
- confidence_idx = next((i for i, p in enumerate(parts) if 'Confidence' in p), None)
434
- if confidence_idx is not None:
435
- break
436
- if confidence_idx is not None:
437
- for line in lines:
438
- if '|' in line and not line.startswith('| ---') and 'Confidence' not in line:
439
- parts = [p.strip() for p in line.split('|')[1:-1]]
440
- if len(parts) > confidence_idx:
441
- try:
442
- conf_str = parts[confidence_idx]
443
- if conf_str and conf_str != 'NA':
444
- # Remove % if present
445
- conf_str = conf_str.rstrip('%')
446
- conf = float(conf_str)
447
- all_confidences.append(conf)
448
- except (ValueError, IndexError):
449
- pass
450
-
451
- avg_confidence = sum(all_confidences) / len(all_confidences) if all_confidences else 0
452
-
453
- return {
454
- "consistency_score": round(consistency_score, 2),
455
- "runs": runs,
456
- "successful_runs": len([r for r in results if r]),
457
- "avg_similarity": round(avg_similarity, 2),
458
- "avg_confidence": round(avg_confidence, 2), # New metric
459
- "total_confidence_scores": len(all_confidences)
460
- }
461
-
462
- # Global instance for backward compatibility
463
- processor = BOQProcessor()
464
-
465
- # Backward compatibility functions
466
- def load_and_process_pdf(pdf_path: str, filename: str = None) -> List[Document]:
467
- return processor.load_and_process_pdf(pdf_path, filename)
468
-
469
- def create_vector_store(chunks: List[Document]) -> FAISS:
470
- return processor.create_vector_store(chunks)
471
-
472
- def setup_rag_chain(vector_store: FAISS):
473
- return processor.setup_rag_chain(vector_store)
474
-
475
- @retry_with_exponential_backoff(max_retries=3, initial_delay=2)
476
- def extract_boq_comprehensive(chunks: List[Document], vector_store: FAISS = None) -> str:
477
- return processor.extract_boq_comprehensive(chunks, vector_store)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
config/settings.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Centralized configuration management for BOQTenders.
3
+ All configurable parameters are defined here with sensible defaults.
4
+ """
5
+ import os
6
+ from typing import Optional
7
+ from pydantic_settings import BaseSettings
8
+ from pydantic import Field
9
+ import dotenv
10
+
11
+ # Load environment variables from .env file
12
+ dotenv.load_dotenv()
13
+
14
+
15
+ class LLMSettings(BaseSettings):
16
+ """LLM-related configuration."""
17
+
18
+ # API Keys
19
+ google_api_key: str = Field(default_factory=lambda: os.getenv("GOOGLE_API_KEY"),description="Google API key for Gemini")
20
+
21
+ # Model Configuration
22
+ model_name: str = Field(default="gemini-2.5-flash-lite",description="LLM model name to use")
23
+ temperature: float = Field(default=0.0,ge=0.0,le=2.0,description="LLM temperature (0.0 = deterministic)")
24
+ max_output_tokens: int = Field(default=8192,description="Maximum tokens in LLM response")
25
+
26
+
27
+ class EmbeddingSettings(BaseSettings):
28
+ """Embedding and vector store configuration."""
29
+
30
+ embedding_model: str = Field(default="sentence-transformers/all-MiniLM-L6-v2",description="HuggingFace embedding model name")
31
+
32
+ # Text Splitting
33
+ chunk_size: int = Field(default=1000,ge=100,le=10000,description="Size of text chunks for splitting")
34
+ chunk_overlap: int = Field(default=500,ge=0,description="Overlap between consecutive chunks")
35
+
36
+
37
+ class PDFExtractionSettings(BaseSettings):
38
+ """PDF extraction API configuration."""
39
+
40
+ hf_api_token: str = Field(default_factory=lambda: os.getenv("HF_API_TOKEN"),description="HuggingFace API token")
41
+ extraction_api_url: str = Field(default="https://point9-extract-text-and-table.hf.space/api/text",description="URL for PDF text extraction API")
42
+
43
+ start_page: int = Field(default=1,ge=1,description="Default start page for extraction")
44
+ end_page: int = Field(default=100,ge=1,description="Default end page for extraction")
45
+ request_timeout: int = Field(default=120,description="API request timeout in seconds")
46
+
47
+
48
+ class BOQExtractionSettings(BaseSettings):
49
+ """BOQ extraction specific configuration."""
50
+
51
+ batch_size: int = Field(default=25,ge=1,le=100,description="Number of chunks per batch for BOQ extraction")
52
+ max_prompt_length: int = Field(default=30000,description="Maximum characters in extraction prompt")
53
+ page_search_length: int = Field(default=30,description="Characters to use for page detection search")
54
+ source_max_length: int = Field(default=50,description="Maximum length for source column")
55
+
56
+
57
+ class ConsistencySettings(BaseSettings):
58
+ """Consistency check configuration."""
59
+
60
+ default_runs: int = Field(default=4,ge=2,le=10,description="Default number of runs for consistency check")
61
+ low_consistency_threshold: float = Field(default=80.0,ge=0.0,le=100.0,description="Threshold below which consistency is considered low")
62
+
63
+
64
+ class APISettings(BaseSettings):
65
+ """FastAPI server configuration."""
66
+
67
+ title: str = Field(default="BOQ Chatbot API",description="API title")
68
+ description: str = Field(default="API for extracting and querying BOQ from tender PDFs using RAG",description="API description")
69
+ version: str = Field(default="1.0.0",description="API version")
70
+ host: str = Field(default="0.0.0.0",description="Server host")
71
+ port: int = Field(default=8000,description="Server port")
72
+ debug: bool = Field(default=False,description="Enable debug mode")
73
+ docs_enabled: bool = Field(default=True,description="Enable API documentation endpoints")
74
+ cors_origins: list = Field(default=["*"],description="Allowed CORS origins")
75
+
76
+
77
+ class StreamlitSettings(BaseSettings):
78
+ """Streamlit UI configuration."""
79
+
80
+ page_title: str = Field(default="BOQ Agent",description="Page title")
81
+ page_icon: str = Field(default="πŸ“„",description="Page icon")
82
+ layout: str = Field(default="wide",description="Page layout (wide/centered)")
83
+
84
+
85
+ class Settings(BaseSettings):
86
+ """
87
+ Main settings class that aggregates all configuration sections.
88
+ Access via: settings.llm, settings.embedding, settings.pdf, etc.
89
+ """
90
+
91
+ # Global settings
92
+ log_level: str = Field(default="INFO",description="Logging level (DEBUG, INFO, WARNING, ERROR)")
93
+
94
+ llm: LLMSettings = Field(default_factory=LLMSettings)
95
+ embedding: EmbeddingSettings = Field(default_factory=EmbeddingSettings)
96
+ pdf: PDFExtractionSettings = Field(default_factory=PDFExtractionSettings)
97
+ boq: BOQExtractionSettings = Field(default_factory=BOQExtractionSettings)
98
+ consistency: ConsistencySettings = Field(default_factory=ConsistencySettings)
99
+ api: APISettings = Field(default_factory=APISettings)
100
+ streamlit: StreamlitSettings = Field(default_factory=StreamlitSettings)
101
+
102
+ class Config:
103
+ env_file = ".env"
104
+ env_file_encoding = "utf-8"
105
+ extra = "ignore" # Ignore extra env vars not defined in model
106
+
107
+
108
+ # Global settings instance
109
+ settings = Settings()
core/embeddings.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Embedding and vector store management module.
3
+ """
4
+ from typing import List, Optional
5
+ from loguru import logger
6
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
7
+ from langchain_huggingface import HuggingFaceEmbeddings
8
+ from langchain_community.vectorstores import FAISS
9
+ from langchain_core.documents import Document
10
+
11
+ from config.settings import settings
12
+
13
+
14
+ class EmbeddingService:
15
+ """
16
+ Handles text chunking, embeddings, and vector store operations.
17
+
18
+ Example:
19
+ service = EmbeddingService()
20
+ chunks = service.split_text(text)
21
+ vector_store = service.create_vector_store(chunks)
22
+ """
23
+
24
+ def __init__(self, embedding_model: str = None, chunk_size: int = None, chunk_overlap: int = None):
25
+ """
26
+ Initialize embedding service.
27
+
28
+ Args:
29
+ embedding_model: HuggingFace model name. Defaults to config value.
30
+ chunk_size: Size of text chunks. Defaults to config value.
31
+ chunk_overlap: Overlap between chunks. Defaults to config value.
32
+ """
33
+ self.embedding_model = embedding_model or settings.embedding.embedding_model
34
+ self.chunk_size = chunk_size or settings.embedding.chunk_size
35
+ self.chunk_overlap = chunk_overlap or settings.embedding.chunk_overlap
36
+
37
+ self._text_splitter = RecursiveCharacterTextSplitter(
38
+ chunk_size=self.chunk_size,
39
+ chunk_overlap=self.chunk_overlap
40
+ )
41
+ self._embeddings: Optional[HuggingFaceEmbeddings] = None
42
+
43
+ @property
44
+ def embeddings(self) -> HuggingFaceEmbeddings:
45
+ """Lazy-load embeddings model."""
46
+ if self._embeddings is None:
47
+ logger.info(f'Loading embeddings model: {self.embedding_model}')
48
+ self._embeddings = HuggingFaceEmbeddings(model_name=self.embedding_model)
49
+ logger.info('Embeddings model loaded successfully')
50
+ return self._embeddings
51
+
52
+ def split_text(self, text: str) -> List[Document]:
53
+ """
54
+ Split text into chunks for processing.
55
+
56
+ Args:
57
+ text: Full text content to split.
58
+
59
+ Returns:
60
+ List of Document chunks.
61
+ """
62
+ logger.info(f'Splitting text of length {len(text)} into chunks...')
63
+ chunks = self._text_splitter.create_documents([text])
64
+ logger.info(f'Split into {len(chunks)} chunks')
65
+ return chunks
66
+
67
+ def create_vector_store(self, chunks: List[Document]) -> FAISS:
68
+ """
69
+ Create a FAISS vector store from document chunks.
70
+
71
+ Args:
72
+ chunks: List of Document objects.
73
+
74
+ Returns:
75
+ FAISS vector store instance.
76
+
77
+ Raises:
78
+ Exception: If vector store creation fails.
79
+ """
80
+ try:
81
+ logger.info(f'Creating vector store from {len(chunks)} chunks')
82
+ vector_store = FAISS.from_documents(chunks, self.embeddings)
83
+ logger.info('Vector store created successfully')
84
+ return vector_store
85
+ except Exception as e:
86
+ logger.error(f'Error creating vector store: {e}')
87
+ raise
88
+
89
+ def add_documents(self, vector_store: FAISS, documents: List[Document]) -> None:
90
+ """
91
+ Add new documents to an existing vector store.
92
+
93
+ Args:
94
+ vector_store: Existing FAISS vector store.
95
+ documents: Documents to add.
96
+ """
97
+ logger.info(f'Adding {len(documents)} documents to vector store')
98
+ vector_store.add_documents(documents)
99
+ logger.info('Documents added successfully')
100
+
101
+ def similarity_search(
102
+ self,
103
+ vector_store: FAISS,
104
+ query: str,
105
+ k: int = 4
106
+ ) -> List[Document]:
107
+ """
108
+ Perform similarity search on vector store.
109
+
110
+ Args:
111
+ vector_store: FAISS vector store to search.
112
+ query: Search query.
113
+ k: Number of results to return.
114
+
115
+ Returns:
116
+ List of most similar documents.
117
+ """
118
+ return vector_store.similarity_search(query, k=k)
core/llm.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ LLM client wrapper for Google Generative AI (Gemini).
3
+ """
4
+ from typing import Optional
5
+ from loguru import logger
6
+ from langchain_google_genai import GoogleGenerativeAI
7
+
8
+ from config.settings import settings
9
+
10
+
11
+ class LLMClient:
12
+ """
13
+ Wrapper for LLM interactions with Google Generative AI.
14
+
15
+ Example:
16
+ client = LLMClient()
17
+ response = client.invoke("What is a BOQ?")
18
+ """
19
+
20
+ def __init__(self, model_name: str = None, temperature: float = None, api_key: str = None):
21
+ """
22
+ Initialize LLM client.
23
+
24
+ Args:
25
+ model_name: Model to use. Defaults to config value.
26
+ temperature: Sampling temperature. Defaults to config value.
27
+ api_key: Google API key. Defaults to config value.
28
+ """
29
+ self.model_name = model_name or settings.llm.model_name
30
+ self.temperature = temperature if temperature is not None else settings.llm.temperature
31
+ self.api_key = api_key or settings.llm.google_api_key
32
+
33
+ self._llm: Optional[GoogleGenerativeAI] = None
34
+
35
+ @property
36
+ def llm(self) -> GoogleGenerativeAI:
37
+ """Lazy-load LLM instance."""
38
+ if self._llm is None:
39
+ logger.info(f'Initializing LLM: {self.model_name} (temp={self.temperature})')
40
+ self._llm = GoogleGenerativeAI(
41
+ model=self.model_name,
42
+ temperature=self.temperature,
43
+ google_api_key=self.api_key
44
+ )
45
+ logger.info('LLM initialized successfully')
46
+ return self._llm
47
+
48
+ def invoke(self, prompt: str) -> str:
49
+ """
50
+ Send a prompt to the LLM and get a response.
51
+
52
+ Args:
53
+ prompt: The prompt text.
54
+
55
+ Returns:
56
+ LLM response as string.
57
+ """
58
+ logger.debug(f'Invoking LLM with prompt of length {len(prompt)}')
59
+ result = str(self.llm.invoke(prompt))
60
+ logger.debug(f'LLM response received, length: {len(result)}')
61
+ return result
62
+
63
+ def batch_invoke(self, prompts: list[str]) -> list[str]:
64
+ """
65
+ Send multiple prompts to the LLM.
66
+
67
+ Args:
68
+ prompts: List of prompt texts.
69
+
70
+ Returns:
71
+ List of LLM responses.
72
+ """
73
+ logger.info(f'Batch invoking LLM with {len(prompts)} prompts')
74
+ results = []
75
+ for i, prompt in enumerate(prompts, 1):
76
+ logger.debug(f'Processing prompt {i}/{len(prompts)}')
77
+ results.append(self.invoke(prompt))
78
+ return results
core/pdf_extractor.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PDF text extraction module using external API.
3
+ """
4
+ import os
5
+ from typing import Optional
6
+ import requests
7
+ from loguru import logger
8
+
9
+ from config.settings import settings
10
+
11
+
12
+ class PDFExtractor:
13
+ """
14
+ Handles PDF text extraction using HuggingFace Space API.
15
+
16
+ Example:
17
+ extractor = PDFExtractor()
18
+ text = extractor.extract_text("document.pdf")
19
+ """
20
+
21
+ def __init__(self, api_url: str = None, api_token: str = None, timeout: int = None):
22
+ """
23
+ Initialize PDF extractor.
24
+
25
+ Args:
26
+ api_url: URL for extraction API. Defaults to config value.
27
+ api_token: HuggingFace API token. Defaults to config value.
28
+ timeout: Request timeout in seconds. Defaults to config value.
29
+ """
30
+ self.api_url = api_url or settings.pdf.extraction_api_url
31
+ self.api_token = api_token or settings.pdf.hf_api_token
32
+ self.timeout = timeout or settings.pdf.request_timeout
33
+
34
+ def extract_text(self, pdf_path: str, start_page: int = None, end_page: int = None, filename: str = None) -> str:
35
+ """
36
+ Extract text from a PDF file.
37
+
38
+ Args:
39
+ pdf_path: Path to the PDF file.
40
+ start_page: Starting page number (1-indexed). Defaults to config value.
41
+ end_page: Ending page number. Defaults to config value.
42
+ filename: Display name for logging. Defaults to basename of pdf_path.
43
+
44
+ Returns:
45
+ Extracted text content.
46
+
47
+ Raises:
48
+ requests.RequestException: If API request fails.
49
+ FileNotFoundError: If PDF file doesn't exist.
50
+ """
51
+ # Use config defaults if not specified
52
+ start_page = start_page or settings.pdf.start_page
53
+ end_page = end_page or settings.pdf.end_page
54
+ display_name = filename or os.path.basename(pdf_path)
55
+
56
+ logger.info(f'Starting text extraction for {display_name} (pages {start_page}-{end_page})')
57
+
58
+ if not os.path.exists(pdf_path):
59
+ raise FileNotFoundError(f"PDF file not found: {pdf_path}")
60
+
61
+ with open(pdf_path, 'rb') as f:
62
+ files = {'file': (os.path.basename(pdf_path), f, 'application/pdf')}
63
+ data = {
64
+ 'start_page': start_page,
65
+ 'end_page': end_page,
66
+ 'filename': os.path.basename(pdf_path)
67
+ }
68
+ headers = {'Authorization': f'Bearer {self.api_token}'}
69
+
70
+ response = requests.post(
71
+ self.api_url,
72
+ files=files,
73
+ data=data,
74
+ headers=headers,
75
+ timeout=self.timeout
76
+ )
77
+ response.raise_for_status()
78
+
79
+ json_response = response.json()
80
+ if isinstance(json_response, dict):
81
+ result = json_response.get('result', '')
82
+ else:
83
+ logger.error(f"Unexpected response format: {json_response}")
84
+ result = ''
85
+
86
+ logger.info(f'Text extraction completed, response length: {len(result)}')
87
+ return result
88
+
89
+ def extract_text_preview(self, pdf_path: str, max_chars: int = 200) -> str:
90
+ """
91
+ Extract and return a preview of the PDF text.
92
+
93
+ Args:
94
+ pdf_path: Path to the PDF file.
95
+ max_chars: Maximum characters to return.
96
+
97
+ Returns:
98
+ Preview of extracted text.
99
+ """
100
+ text = self.extract_text(pdf_path, start_page=1, end_page=5)
101
+ return text[:max_chars] + "..." if len(text) > max_chars else text
core/rag_chain.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ RAG chain builder for conversational retrieval.
3
+ """
4
+ from loguru import logger
5
+ from langchain_community.vectorstores import FAISS
6
+ from langchain_classic.chains import ConversationalRetrievalChain
7
+ from langchain_classic.memory import ConversationBufferMemory
8
+ from langchain_core.prompts import PromptTemplate
9
+
10
+ from core.llm import LLMClient
11
+ from prompts.get_prompts import QA_TEMPLATE
12
+
13
+
14
+ class RAGChainBuilder:
15
+ """
16
+ Builder for RAG (Retrieval-Augmented Generation) chains.
17
+
18
+ Example:
19
+ builder = RAGChainBuilder()
20
+ chain = builder.build(vector_store)
21
+ response = chain({"question": "What is the total quantity?"})
22
+ """
23
+
24
+ def __init__(self, llm_client: LLMClient = None):
25
+ """
26
+ Initialize RAG chain builder.
27
+
28
+ Args:
29
+ llm_client: LLM client instance. Creates new one if not provided.
30
+ """
31
+ self.llm_client = llm_client or LLMClient()
32
+
33
+ def build(self, vector_store: FAISS, qa_template: str = None, memory_key: str = "chat_history", return_messages: bool = True) -> ConversationalRetrievalChain:
34
+ """
35
+ Build a conversational retrieval chain.
36
+
37
+ Args:
38
+ vector_store: FAISS vector store with document embeddings.
39
+ qa_template: Custom Q&A prompt template. Defaults to standard template.
40
+ memory_key: Key for conversation memory.
41
+ return_messages: Whether to return messages in memory.
42
+
43
+ Returns:
44
+ Configured ConversationalRetrievalChain.
45
+ """
46
+ logger.info('Building RAG chain with LangChain classic API')
47
+
48
+ # Create memory
49
+ memory = ConversationBufferMemory(
50
+ memory_key=memory_key,
51
+ return_messages=return_messages
52
+ )
53
+
54
+ # Create prompt
55
+ template = qa_template or QA_TEMPLATE
56
+ qa_prompt = PromptTemplate.from_template(template)
57
+
58
+ # Build chain
59
+ chain = ConversationalRetrievalChain.from_llm(
60
+ llm=self.llm_client.llm,
61
+ retriever=vector_store.as_retriever(),
62
+ memory=memory,
63
+ combine_docs_chain_kwargs={'prompt': qa_prompt},
64
+ )
65
+
66
+ logger.info('RAG chain built successfully')
67
+ return chain
68
+
69
+ def build_simple_retriever(self, vector_store: FAISS, k: int = 4):
70
+ """
71
+ Build a simple retriever without conversation memory.
72
+
73
+ Args:
74
+ vector_store: FAISS vector store.
75
+ k: Number of documents to retrieve.
76
+
77
+ Returns:
78
+ Retriever instance.
79
+ """
80
+ return vector_store.as_retriever(search_kwargs={"k": k})
prompts/get_prompts.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Load prompt templates from YAML file for BOQ extraction.
3
+ """
4
+ from pathlib import Path
5
+ import yaml
6
+
7
+ # Load templates from YAML
8
+ _TEMPLATES_PATH = Path(__file__).parent / "templates.yaml"
9
+
10
+ with open(_TEMPLATES_PATH, 'r', encoding='utf-8') as f:
11
+ _templates = yaml.safe_load(f)
12
+
13
+ # Export as module constants
14
+ QA_TEMPLATE = _templates['qa_template']
15
+ METADATA_EXTRACTION_TEMPLATE = _templates['metadata_extraction_template']
16
+ BOQ_EXTRACTION_TEMPLATE = _templates['boq_extraction_template']
17
+ BOQ_COLUMN_HEADERS = _templates['boq_column_headers']
prompts/templates.yaml ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # BOQTenders Prompt Templates
2
+ # YAML format for better production management and versioning
3
+
4
+ qa_template: |
5
+ You are an expert assistant specializing in construction and tender documents, with deep knowledge of Bill of Quantities (BOQ) analysis. Your role is to provide accurate, helpful, and professional responses based solely on the provided context.
6
+
7
+ Guidelines:
8
+ - Always base your answers on the given context. Do not use external knowledge or assumptions.
9
+ - For BOQ-related questions, provide detailed, structured information including item codes, descriptions, quantities, units, rates, and amounts where available.
10
+ - If the context lacks specific information, respond with: "The requested information is not available in the provided document context."
11
+ - Be concise yet comprehensive. Structure responses clearly (e.g., use bullet points or tables for lists).
12
+ - Handle follow-up questions by referencing previous context in the conversation history.
13
+ - Maintain neutrality and professionalism in all responses.
14
+
15
+ {context}
16
+
17
+ Question: {question}
18
+ Answer:
19
+
20
+ metadata_extraction_template: |
21
+ Extract key information from this tender document excerpt in a concise format:
22
+ - Document Type
23
+ - Project Name
24
+ - Issuing Authority
25
+ - Tender Number
26
+ - Date
27
+ - Location
28
+
29
+ Document excerpt:
30
+ {document_text}
31
+
32
+ Output only the facts, no extra analysis.
33
+
34
+ boq_extraction_template: |
35
+ Analyze this text and extract ONLY Bill of Quantities (BOQ) line items if present.
36
+
37
+ Look for structured data with:
38
+ - Item numbers or codes
39
+ - Descriptions of work/materials (extract the complete, full description as it appears in the document, without truncation)
40
+ - Quantities
41
+ - Units (Nos, Sqm, Cum, m, etc.)
42
+ - Rates/Unit prices
43
+ - Total amounts
44
+
45
+ If you find BOQ items, return them in this EXACT format (pipe-separated):
46
+ ITEM_CODE|DESCRIPTION|QUANTITY|UNIT|RATE|AMOUNT|CONFIDENCE
47
+
48
+ Where:
49
+ - CONFIDENCE is a score (0-100%) based on how clearly and completely the data appears in the text. Use lower scores (e.g., 70-90%) if information is partially missing, inferred, or unclear. Use 100% only for complete, directly stated data.
50
+
51
+ Rules for columns:
52
+ - If an entire column has no values, omit that column.
53
+ - For missing values, use "NA".
54
+
55
+ Return multiple items on separate lines. If NO BOQ items are found, return: "NO_BOQ_ITEMS"
56
+
57
+ Text to analyze:
58
+ {batch_text}
59
+
60
+ Extract only actual BOQ line items.
61
+
62
+ # Column headers for BOQ output table
63
+ boq_column_headers:
64
+ - "Item No/Code"
65
+ - "Description"
66
+ - "Quantity"
67
+ - "Unit"
68
+ - "Rate"
69
+ - "Amount"
70
+ - "Confidence Score"
71
+ - "Source"
requirements.txt CHANGED
@@ -1,15 +1,51 @@
1
- langchain
2
- langchain-community
3
- pdfplumber
4
- faiss-cpu
5
- sentence-transformers
6
- streamlit
7
- langchain-google-genai
8
- langchain-huggingface
9
- fastapi
10
- uvicorn
11
- pydantic-settings
12
- loguru
13
- python-dotenv
14
- python-multipart
15
- requests
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # BOQTenders - Requirements
2
+ # Python 3.9+
3
+
4
+ # ============================================
5
+ # Core Dependencies
6
+ # ============================================
7
+ python-dotenv>=1.0.0
8
+ pydantic>=2.0.0
9
+ pydantic-settings>=2.0.0
10
+ loguru>=0.7.0
11
+ PyYAML>=6.0.0
12
+
13
+ # ============================================
14
+ # LLM & AI
15
+ # ============================================
16
+ # LangChain 0.1.x (Classic API - stable)
17
+ langchain>=0.1.0,<0.2.0
18
+ langchain-community>=0.0.1,<0.1.0
19
+ langchain-google-genai>=0.0.5
20
+
21
+ # Vector Store
22
+ faiss-cpu>=1.7.4
23
+
24
+ # Embeddings
25
+ sentence-transformers>=2.2.0
26
+
27
+ # ============================================
28
+ # PDF Processing
29
+ # ============================================
30
+ PyPDF2>=3.0.0
31
+ requests>=2.31.0
32
+
33
+ # ============================================
34
+ # Web Framework
35
+ # ============================================
36
+ # API
37
+ fastapi>=0.100.0
38
+ uvicorn[standard]>=0.23.0
39
+ python-multipart>=0.0.6
40
+
41
+ # UI
42
+ streamlit>=1.28.0
43
+
44
+ # ============================================
45
+ # Development (optional)
46
+ # ============================================
47
+ # pytest>=7.4.0
48
+ # pytest-asyncio>=0.21.0
49
+ # black>=23.0.0
50
+ # isort>=5.12.0
51
+ # mypy>=1.5.0
services/boq_extractor.py ADDED
@@ -0,0 +1,296 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ BOQ extraction service for extracting Bill of Quantities from documents.
3
+ """
4
+ import re
5
+ from typing import List, Optional, Tuple
6
+ from loguru import logger
7
+ from langchain_core.documents import Document
8
+ from langchain_community.vectorstores import FAISS
9
+
10
+ from config.settings import settings
11
+ from core.llm import LLMClient
12
+ from prompts.get_prompts import (
13
+ METADATA_EXTRACTION_TEMPLATE,
14
+ BOQ_EXTRACTION_TEMPLATE,
15
+ BOQ_COLUMN_HEADERS,
16
+ )
17
+
18
+ # Page marker pattern for detecting source pages
19
+ PAGE_MARKER_PATTERN = r"(?i)(?:---\s*)?page\s+(\d+)(?:\s*---)?"
20
+
21
+
22
+ class BOQExtractor:
23
+ """
24
+ Extracts Bill of Quantities (BOQ) data from document chunks.
25
+
26
+ Example:
27
+ extractor = BOQExtractor()
28
+ boq_output = extractor.extract(chunks)
29
+ """
30
+
31
+ def __init__(self, llm_client: LLMClient = None, batch_size: int = None, max_prompt_length: int = None, page_search_length: int = None):
32
+ """
33
+ Initialize BOQ extractor.
34
+
35
+ Args:
36
+ llm_client: LLM client instance. Creates new one if not provided.
37
+ batch_size: Number of chunks per batch. Defaults to config value.
38
+ max_prompt_length: Max chars in prompt. Defaults to config value.
39
+ page_search_length: Chars for page detection. Defaults to config value.
40
+ """
41
+ self.llm_client = llm_client or LLMClient()
42
+ self.batch_size = batch_size or settings.boq.batch_size
43
+ self.max_prompt_length = max_prompt_length or settings.boq.max_prompt_length
44
+ self.page_search_length = page_search_length or settings.boq.page_search_length
45
+ self.source_max_length = settings.boq.source_max_length
46
+
47
+ def _batch_chunks(self, chunks: List[Document]) -> List[List[Document]]:
48
+ """Split chunks into batches."""
49
+ return [
50
+ chunks[i:i + self.batch_size]
51
+ for i in range(0, len(chunks), self.batch_size)
52
+ ]
53
+
54
+ def _extract_metadata(self, chunks: List[Document]) -> str:
55
+ """Extract document metadata from first few chunks."""
56
+ metadata_text = '\n\n'.join([chunk.page_content for chunk in chunks[:3]])
57
+ prompt = METADATA_EXTRACTION_TEMPLATE.format(
58
+ document_text=metadata_text[:2000]
59
+ )
60
+
61
+ logger.info('Invoking LLM for metadata extraction...')
62
+ result = self.llm_client.invoke(prompt)
63
+ logger.info('Metadata extraction completed')
64
+ return result
65
+
66
+ def _detect_page_source(self, desc: str, batch_text: str) -> str:
67
+ """
68
+ Detect the page number for a BOQ item based on its description.
69
+
70
+ Args:
71
+ desc: Item description.
72
+ batch_text: Full batch text to search in.
73
+
74
+ Returns:
75
+ Page source string (e.g., "Page 5") or "Unknown".
76
+ """
77
+ search_str = desc[:self.page_search_length].strip().lower()
78
+ batch_text_lower = batch_text.lower()
79
+ pos = batch_text_lower.rfind(search_str)
80
+
81
+ if pos != -1:
82
+ matches = list(re.finditer(PAGE_MARKER_PATTERN, batch_text[:pos]))
83
+ if matches:
84
+ page = matches[-1].group(1)
85
+ return f"Page {page}"
86
+
87
+ return "Unknown"
88
+
89
+ def _parse_boq_line(self, line: str, batch_text: str) -> Optional[str]:
90
+ """
91
+ Parse a single BOQ line from LLM output.
92
+
93
+ Args:
94
+ line: Raw line from LLM output.
95
+ batch_text: Full batch text for page detection.
96
+
97
+ Returns:
98
+ Formatted BOQ item string or None if invalid.
99
+ """
100
+ line = line.strip()
101
+ if not line or '|' not in line:
102
+ return None
103
+
104
+ parts = [p.strip() for p in line.split('|')]
105
+ if len(parts) < 7:
106
+ parts += ['NA'] * (7 - len(parts))
107
+
108
+ # Add source column
109
+ parts.append("Unknown")
110
+
111
+ # Detect page source
112
+ desc = parts[1]
113
+ parts[7] = self._detect_page_source(desc, batch_text)
114
+
115
+ return '|'.join(parts[:8])
116
+
117
+ def _extract_from_batch(self, batch_text: str, batch_num: int) -> List[str]:
118
+ """
119
+ Extract BOQ items from a single batch.
120
+
121
+ Args:
122
+ batch_text: Combined text from batch chunks.
123
+ batch_num: Batch number for logging.
124
+
125
+ Returns:
126
+ List of BOQ item strings.
127
+ """
128
+ prompt_text = batch_text[:self.max_prompt_length]
129
+ prompt = BOQ_EXTRACTION_TEMPLATE.format(batch_text=prompt_text)
130
+
131
+ try:
132
+ logger.info(f'Invoking LLM for BOQ extraction on batch {batch_num}...')
133
+ result = self.llm_client.invoke(prompt)
134
+ logger.info(f'LLM response received for batch {batch_num}')
135
+
136
+ if 'NO_BOQ_ITEMS' in result:
137
+ logger.info(f'No BOQ items found in batch {batch_num}')
138
+ return []
139
+
140
+ boq_items = []
141
+ lines = result.strip().split('\n')
142
+
143
+ for line in lines:
144
+ parsed = self._parse_boq_line(line, batch_text)
145
+ if parsed:
146
+ boq_items.append(parsed)
147
+
148
+ logger.info(f'Extracted {len(boq_items)} BOQ items from batch {batch_num}')
149
+ return boq_items
150
+
151
+ except Exception as e:
152
+ logger.warning(f'Error processing batch {batch_num}: {e}')
153
+ return []
154
+
155
+ def _format_output(self, unique_items: List[str], metadata_result: str) -> str:
156
+ """
157
+ Format extracted BOQ items into markdown output.
158
+
159
+ Args:
160
+ unique_items: List of unique BOQ item strings.
161
+ metadata_result: Document metadata.
162
+
163
+ Returns:
164
+ Formatted markdown string.
165
+ """
166
+ logger.info('Formatting BOQ output...')
167
+
168
+ if not unique_items:
169
+ logger.info('No BOQ items to format')
170
+ return f'''## DOCUMENT SUMMARY
171
+ {metadata_result}
172
+
173
+ ## DETAILED BILL OF QUANTITIES
174
+ No BOQ items were found in this document.'''
175
+
176
+ # Determine which columns have data
177
+ cols_present = [False] * 8
178
+ normalized_items = []
179
+
180
+ for item in unique_items:
181
+ parts = [p.strip() for p in item.split('|')]
182
+ if len(parts) < 8:
183
+ parts += ['NA'] * (8 - len(parts))
184
+ normalized_items.append(parts[:8])
185
+
186
+ for i in range(8):
187
+ if parts[i] and parts[i].upper() != 'NA':
188
+ cols_present[i] = True
189
+
190
+ # Build column indices (always include item code and description)
191
+ col_indices = [i for i, present in enumerate(cols_present) if present]
192
+ if 0 not in col_indices:
193
+ col_indices.insert(0, 0)
194
+ if 1 not in col_indices:
195
+ col_indices.insert(1, 1)
196
+
197
+ # Build header and separator rows
198
+ header_row = '| ' + ' | '.join([BOQ_COLUMN_HEADERS[i] for i in col_indices]) + ' |\n'
199
+ sep_row = '|' + '|'.join(['-' * (len(BOQ_COLUMN_HEADERS[i]) + 2) for i in col_indices]) + '|\n'
200
+
201
+ formatted_boq = f'''## DOCUMENT SUMMARY
202
+ {metadata_result}
203
+
204
+ ## DETAILED BILL OF QUANTITIES
205
+ **Total Items Found:** {len(unique_items)}
206
+
207
+ {header_row}{sep_row}'''
208
+
209
+ # Add data rows
210
+ for parts in normalized_items:
211
+ # Truncate source if too long
212
+ parts[7] = parts[7][:self.source_max_length] if len(parts[7]) > self.source_max_length else parts[7]
213
+ row_vals = [parts[i] for i in col_indices]
214
+
215
+ # Format confidence score
216
+ if 6 in col_indices:
217
+ conf_idx = col_indices.index(6)
218
+ if row_vals[conf_idx] != 'NA':
219
+ row_vals[conf_idx] = row_vals[conf_idx].rstrip('%') + '%'
220
+ if parts[7] == "Unknown":
221
+ row_vals[conf_idx] = "N/A"
222
+
223
+ formatted_boq += '| ' + ' | '.join(row_vals) + ' |\n'
224
+
225
+ # Clean up formatting
226
+ try:
227
+ s = formatted_boq.replace('\r\n', '\n').replace('\r', '\n')
228
+ lines = [ln.lstrip() for ln in s.split('\n')]
229
+ header_idx = next((i for i, ln in enumerate(lines) if ln.startswith('| ')), None)
230
+
231
+ if header_idx and header_idx > 0 and lines[header_idx - 1].strip():
232
+ lines.insert(header_idx, '')
233
+
234
+ if header_idx:
235
+ sep_idx = header_idx + 1
236
+ if not (sep_idx < len(lines) and re.match(r'^\|\s*-+', lines[sep_idx])):
237
+ cols = [c for c in lines[header_idx].split('|') if c.strip()]
238
+ sep = '|' + '|'.join(['---' for _ in cols]) + '|'
239
+ lines.insert(sep_idx, sep)
240
+
241
+ formatted_boq = '\n'.join(lines).strip() + '\n\n'
242
+ except Exception:
243
+ pass
244
+
245
+ return formatted_boq
246
+
247
+ def extract(self, chunks: List[Document], vector_store: FAISS = None) -> str:
248
+ """
249
+ Extract BOQ from document chunks.
250
+
251
+ Args:
252
+ chunks: List of Document chunks.
253
+ vector_store: Optional vector store (not used currently).
254
+
255
+ Returns:
256
+ Formatted BOQ output as markdown string.
257
+ """
258
+ try:
259
+ logger.info(f'Starting comprehensive BOQ extraction from {len(chunks)} chunks')
260
+
261
+ # Extract metadata
262
+ logger.info('Extracting document metadata...')
263
+ metadata_result = self._extract_metadata(chunks)
264
+
265
+ # Create batches
266
+ logger.info('Creating batches...')
267
+ batches = self._batch_chunks(chunks)
268
+ logger.info(f'Created {len(batches)} batches')
269
+
270
+ # Extract from each batch
271
+ boq_items = []
272
+ for batch_num, batch_chunks in enumerate(batches, 1):
273
+ logger.info(f'Processing batch {batch_num}/{len(batches)} ({len(batch_chunks)} chunks)')
274
+
275
+ chunk_texts = [chunk.page_content for chunk in batch_chunks]
276
+ batch_text = '\n\n'.join(chunk_texts)
277
+ logger.info(f'Batch text length: {len(batch_text)}')
278
+
279
+ batch_items = self._extract_from_batch(batch_text, batch_num)
280
+ boq_items.extend(batch_items)
281
+ logger.info(f'Batch {batch_num} yielded {len(batch_items)} items')
282
+
283
+ # Deduplicate
284
+ unique_items = list(dict.fromkeys(boq_items))
285
+ logger.info(f'Found {len(unique_items)} unique BOQ items after deduplication')
286
+
287
+ # Format output
288
+ logger.info('Formatting BOQ output...')
289
+ formatted_boq = self._format_output(unique_items, metadata_result)
290
+
291
+ logger.info('Comprehensive BOQ extraction completed successfully')
292
+ return formatted_boq
293
+
294
+ except Exception as e:
295
+ logger.error(f'Error in comprehensive BOQ extraction: {e}')
296
+ raise
services/consistency.py ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Consistency checking service for evaluating BOQ extraction reliability.
3
+ """
4
+ from typing import List, Dict, Any
5
+ from difflib import SequenceMatcher
6
+ from loguru import logger
7
+ from langchain_core.documents import Document
8
+ from langchain_community.vectorstores import FAISS
9
+
10
+ from config.settings import settings
11
+ from services.boq_extractor import BOQExtractor
12
+
13
+
14
class ConsistencyChecker:
    """
    Checks consistency of BOQ extractions across multiple runs.

    Runs the extractor several times over the same chunks, measures pairwise
    text similarity of the outputs, and aggregates the per-item confidence
    scores reported in the BOQ tables.

    Example:
        checker = ConsistencyChecker()
        result = checker.check(chunks, vector_store, runs=4)
        print(f"Consistency: {result['consistency_score']}%")
    """

    def __init__(
        self,
        boq_extractor: "Optional[BOQExtractor]" = None,
        default_runs: "Optional[int]" = None,
        low_threshold: "Optional[float]" = None,
    ):
        """
        Initialize consistency checker.

        Args:
            boq_extractor: BOQ extractor instance. Creates a new one if not provided.
            default_runs: Default number of extraction runs. Defaults to config value.
            low_threshold: Threshold for low-consistency warning. Defaults to config value.
        """
        # Explicit None checks: a deliberate falsy value (e.g. threshold 0,
        # meaning "never warn") must not be replaced by the config default.
        self.boq_extractor = boq_extractor if boq_extractor is not None else BOQExtractor()
        self.default_runs = (
            default_runs if default_runs is not None else settings.consistency.default_runs
        )
        self.low_threshold = (
            low_threshold if low_threshold is not None
            else settings.consistency.low_consistency_threshold
        )

    @staticmethod
    def _is_separator_row(line: str) -> bool:
        """Return True for markdown table separator rows like '|---|---|' or '| --- |'."""
        stripped = line.strip()
        if not stripped.startswith('|'):
            return False
        # A separator row contains only pipes, dashes, colons and spaces.
        return stripped.replace('|', '').replace('-', '').replace(':', '').strip() == ''

    def _calculate_similarity(self, results: "List[str]") -> float:
        """
        Calculate average pairwise similarity between results.

        Args:
            results: List of BOQ extraction results; empty strings (failed
                runs) are excluded from the comparison.

        Returns:
            Average similarity score (0.0 to 1.0); 0 if no comparable pairs.
        """
        similarities = []

        for i in range(len(results)):
            for j in range(i + 1, len(results)):
                if results[i] and results[j]:
                    sim = SequenceMatcher(None, results[i], results[j]).ratio()
                    similarities.append(sim)

        return sum(similarities) / len(similarities) if similarities else 0

    def _extract_confidence_scores(self, boq: str) -> "List[float]":
        """
        Extract confidence scores from a formatted BOQ markdown table.

        Args:
            boq: Formatted BOQ output string.

        Returns:
            List of numeric confidence values found in data rows.
        """
        if not boq:
            return []

        lines = boq.split('\n')
        confidence_idx = None
        confidences = []

        # Locate the confidence column index from the table header row.
        for line in lines:
            line = line.strip()
            if '|' in line and 'Confidence' in line and not self._is_separator_row(line):
                parts = [p.strip() for p in line.split('|')[1:-1]]
                confidence_idx = next(
                    (i for i, p in enumerate(parts) if 'Confidence' in p),
                    None
                )
                if confidence_idx is not None:
                    break

        if confidence_idx is None:
            return []

        # Pull confidence values from data rows. BUGFIX: separator rows are
        # now detected explicitly -- the old `startswith('| ---')` test never
        # matched the '|---|' separators the extractor emits, so those rows
        # were only filtered by float() raising.
        for line in lines:
            if '|' in line and 'Confidence' not in line and not self._is_separator_row(line):
                parts = [p.strip() for p in line.split('|')[1:-1]]
                if len(parts) > confidence_idx:
                    try:
                        conf_str = parts[confidence_idx]
                        if conf_str and conf_str not in ('NA', 'N/A'):
                            confidences.append(float(conf_str.rstrip('%')))
                    except (ValueError, IndexError):
                        pass

        return confidences

    def check(self, chunks: "List[Document]", vector_store: "FAISS", runs: "Optional[int]" = None) -> "Dict[str, Any]":
        """
        Run multiple BOQ extractions and compute consistency metrics.

        Args:
            chunks: Document chunks to extract from.
            vector_store: Vector store (passed to extractor).
            runs: Number of extraction runs. Defaults to config value.

        Returns:
            Dictionary with consistency metrics:
                - consistency_score: Overall consistency percentage
                - runs: Number of runs attempted
                - successful_runs: Number of successful runs
                - avg_similarity: Average pairwise similarity
                - avg_confidence: Average confidence score
                - total_confidence_scores: Number of confidence scores found
                - is_low_consistency: Whether consistency is below threshold
        """
        runs = runs if runs is not None else self.default_runs
        logger.info(f'Starting consistency check with {runs} runs')

        results = []
        for run_num in range(runs):
            try:
                logger.info(f'Consistency run {run_num + 1}/{runs}')
                results.append(self.boq_extractor.extract(chunks, vector_store))
            except Exception as e:
                # A failed run is recorded as an empty result so similarity
                # and success counts still line up with `runs`.
                logger.warning(f"Consistency run {run_num + 1} failed: {e}")
                results.append("")

        # Pairwise output similarity -> consistency percentage.
        avg_similarity = self._calculate_similarity(results)
        consistency_score = avg_similarity * 100

        # Aggregate confidence scores across all runs.
        all_confidences = []
        for boq in results:
            all_confidences.extend(self._extract_confidence_scores(boq))

        avg_confidence = sum(all_confidences) / len(all_confidences) if all_confidences else 0
        successful_runs = len([r for r in results if r])

        result = {
            "consistency_score": round(consistency_score, 2),
            "runs": runs,
            "successful_runs": successful_runs,
            "avg_similarity": round(avg_similarity, 2),
            "avg_confidence": round(avg_confidence, 2),
            "total_confidence_scores": len(all_confidences),
            "is_low_consistency": consistency_score < self.low_threshold
        }

        logger.info(f'Consistency check completed: {result}')
        return result
streamlit_app.py CHANGED
@@ -1,143 +1,310 @@
1
- import streamlit as st
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  import tempfile
3
- import os
4
- import boq_processor
5
-
6
- st.set_page_config(page_title="BOQ Agent", page_icon="πŸ“„", layout="wide")
7
-
8
- st.title("BOQ Agent")
9
- st.markdown("Upload a tender PDF to extract the Bill of Quantities (BOQ) and chat with the document.")
10
-
11
- # Initialize session state
12
- if "qa_chain" not in st.session_state:
13
- st.session_state.qa_chain = None
14
- if "extracted_boq" not in st.session_state:
15
- st.session_state.extracted_boq = None
16
- if "chunks" not in st.session_state:
17
- st.session_state.chunks = None
18
- if "vector_store" not in st.session_state:
19
- st.session_state.vector_store = None
20
-
21
- if "messages" not in st.session_state:
22
- st.session_state.messages = []
23
-
24
- # Sidebar
25
- with st.sidebar:
26
- st.header("About")
27
- st.markdown("""
28
- - Upload a PDF containing BOQ data.
29
- - Automatically extract the BOQ.
30
- - Ask questions about the document.
31
- """)
32
-
33
- # File uploader
34
- uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
35
-
36
- if uploaded_file is not None:
37
- if st.button("Generate BOQ"):
38
- with st.spinner("Generating BOQ..."):
39
- # Save uploaded file to a temp file
40
- with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
41
- tmp_file.write(uploaded_file.getvalue())
42
- tmp_path = tmp_file.name
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
 
44
  try:
45
- # Process
46
- chunks = boq_processor.load_and_process_pdf(tmp_path, filename=uploaded_file.name)
47
- vector_store = boq_processor.create_vector_store(chunks)
48
- qa_chain = boq_processor.setup_rag_chain(vector_store)
 
 
 
 
 
 
 
 
49
 
50
- # Use comprehensive extraction for complete BOQ coverage
51
- extracted_boq = boq_processor.extract_boq_comprehensive(chunks, vector_store)
 
 
 
 
 
52
 
53
  # Store in session
54
- st.session_state.qa_chain = qa_chain
55
- st.session_state.extracted_boq = extracted_boq
56
  st.session_state.chunks = chunks
57
  st.session_state.vector_store = vector_store
58
- st.session_state.messages = [] # reset chat history
59
-
60
- st.success("βœ… BOQ generated successfully!")
61
- except Exception as e:
62
- error_msg = str(e)
63
- if "429" in error_msg or "RESOURCE_EXHAUSTED" in error_msg or "quota" in error_msg.lower():
64
- st.error(f"⚠️ API Rate Limit Exceeded: You've hit the daily quota limit. Please try again later or upgrade your API plan. See https://ai.google.dev/gemini-api/docs/rate-limits")
65
- else:
66
- st.error(f"Error processing PDF: {error_msg}")
67
  finally:
68
- os.unlink(tmp_path)
 
 
 
 
 
69
 
70
- # Display extracted BOQ
71
- if st.session_state.extracted_boq:
72
- st.subheader("πŸ“Š Extracted Bill of Quantities")
73
-
74
- # Parse and display the extracted BOQ with better formatting
75
- boq_text = st.session_state.extracted_boq
76
 
77
- # Display as markdown for better rendering
78
- st.markdown(boq_text)
 
79
 
80
- st.divider()
 
 
 
 
 
 
 
 
81
 
82
- # Add a download button for the BOQ below
83
- st.download_button(
84
- label="πŸ“₯ Download BOQ as Text",
85
- data=boq_text,
86
- file_name="BOQ_extracted.txt",
87
- mime="text/plain"
88
- )
89
-
90
- # Add consistency check button
91
- if st.button("πŸ” Check BOQ Reliability"):
92
- with st.spinner("Running consistency check..."):
93
  try:
94
- consistency = boq_processor.check_consistency(st.session_state.chunks, st.session_state.vector_store, runs=4)
95
- st.success(f"βœ… Consistency Check Complete")
96
- st.write(f"**Consistency Score:** {consistency['consistency_score']}%")
97
- st.write(f"**Successful Runs:** {consistency['successful_runs']}/{consistency['runs']}")
98
- st.write(f"**Average Confidence:** {consistency['avg_confidence']:.2f} (from {consistency['total_confidence_scores']} scores)")
99
- if consistency['consistency_score'] < 80:
100
- st.warning("⚠️ Low consistency detected. LLM outputs vary significantlyβ€”consider reviewing extractions.")
101
  except Exception as e:
102
- st.error(f"Consistency check failed: {e}")
 
 
 
103
 
104
- st.divider()
105
 
106
- # -------------------------------
107
- # Chat Interface
108
- # -------------------------------
109
- st.subheader("πŸ’¬ Chat with your BOQ")
 
 
 
 
 
 
 
 
 
 
 
 
110
 
111
- # Display previous chat messages
112
- for message in st.session_state.messages:
113
- with st.chat_message(message["role"]):
114
- st.markdown(message["content"])
115
 
116
- # User input
117
- if prompt := st.chat_input("Ask a question about the BOQ"):
118
- # Add user message
119
- st.session_state.messages.append(
120
- {"role": "user", "content": prompt}
121
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
 
123
- with st.chat_message("user"):
124
- st.markdown(prompt)
125
 
126
- # Assistant response
127
- with st.chat_message("assistant"):
128
- with st.spinner("Thinking..."):
129
- try:
130
- # OLD LangChain API (ConversationalRetrievalChain)
131
- response = st.session_state.qa_chain(
132
- {"question": prompt}
133
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
 
135
- answer = response.get("answer", "No response generated.")
136
- st.markdown(answer)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
137
 
138
- st.session_state.messages.append(
139
- {"role": "assistant", "content": answer}
140
- )
141
 
142
- except Exception as e:
143
- st.error(f"Chat error: {e}")
 
1
+ """
2
+ BOQTenders Streamlit Application
3
+
4
+ Interactive web interface for BOQ extraction and document chat.
5
+
6
+ Usage:
7
+ streamlit run streamlit_app.py
8
+ """
9
+ import sys
10
+ from pathlib import Path
11
+
12
+ # Add project root to path
13
+ sys.path.insert(0, str(Path(__file__).parent))
14
+
15
  import tempfile
16
+ import streamlit as st
17
+ from loguru import logger
18
+
19
+ from config.settings import settings
20
+ from core.pdf_extractor import PDFExtractor
21
+ from core.embeddings import EmbeddingService
22
+ from core.rag_chain import RAGChainBuilder
23
+ from services.boq_extractor import BOQExtractor
24
+ from services.consistency import ConsistencyChecker
25
+
26
+ # Configure logging
27
+ logger.remove()
28
+ logger.add(
29
+ sys.stderr,
30
+ level=settings.log_level,
31
+ format="<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level: <8}</level> | <cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> - <level>{message}</level>"
32
+ )
33
+
34
+
35
def initialize_services():
    """Create the shared service singletons once per Streamlit session."""
    # Guard clause: services survive reruns via session state.
    if st.session_state.get("services_initialized"):
        return

    st.session_state.pdf_extractor = PDFExtractor()
    st.session_state.embedding_service = EmbeddingService()
    st.session_state.rag_builder = RAGChainBuilder()
    st.session_state.boq_extractor = BOQExtractor()
    st.session_state.consistency_checker = ConsistencyChecker()
    st.session_state.services_initialized = True
44
+
45
+
46
def initialize_session_state():
    """Seed Streamlit session state with default values (idempotent)."""
    defaults = (
        ("boq_output", None),
        ("qa_chain", None),
        ("vector_store", None),
        ("chunks", None),
        ("chat_history", []),
        ("document_loaded", False),
    )

    # setdefault only writes keys that are not already present, so existing
    # session values are never overwritten on rerun.
    for key, value in defaults:
        st.session_state.setdefault(key, value)
60
+
61
+
62
def process_pdf(uploaded_file) -> bool:
    """
    Process an uploaded PDF: extract text, build embeddings, extract the BOQ,
    and wire up the QA chain, storing everything in session state.

    Returns:
        True if processing succeeded, False otherwise.
    """
    try:
        with st.spinner("Processing PDF..."):
            # Persist the upload to a temp file so the extractor can read a path.
            with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
                tmp.write(uploaded_file.getvalue())
                pdf_path = tmp.name

            try:
                st.info("Extracting text from PDF...")
                text = st.session_state.pdf_extractor.extract_text(pdf_path, filename=uploaded_file.name)
                if not text:
                    st.error("Could not extract text from PDF")
                    return False

                st.info("Creating embeddings...")
                chunks = st.session_state.embedding_service.split_text(text)
                vector_store = st.session_state.embedding_service.create_vector_store(chunks)

                st.info("Extracting BOQ items...")
                boq_output = st.session_state.boq_extractor.extract(chunks, vector_store)

                st.info("Building chat interface...")
                qa_chain = st.session_state.rag_builder.build(vector_store)

                # Publish results to the session and reset the chat.
                st.session_state.chunks = chunks
                st.session_state.vector_store = vector_store
                st.session_state.boq_output = boq_output
                st.session_state.qa_chain = qa_chain
                st.session_state.document_loaded = True
                st.session_state.chat_history = []

                st.success(f"✅ Processed {len(chunks)} document chunks")
                return True
            finally:
                # Always remove the temp file, even on failure.
                Path(pdf_path).unlink(missing_ok=True)

    except Exception as e:
        logger.error(f"Error processing PDF: {e}")
        st.error(f"Error processing PDF: {str(e)}")
        return False
116
 
117
+
118
def render_chat_interface():
    """Render the document chat tab: history replay plus a chat input box."""
    st.subheader("💬 Chat with Document")

    if not st.session_state.document_loaded:
        st.info("Please upload a PDF to enable chat")
        return

    # Replay the conversation so far; any non-user role renders as assistant.
    for message in st.session_state.chat_history:
        role = message["role"]
        st.chat_message(role if role == "user" else "assistant").write(message["content"])

    # Handle a new question, if one was submitted this rerun.
    if prompt := st.chat_input("Ask a question about the document..."):
        st.session_state.chat_history.append({"role": "user", "content": prompt})
        st.chat_message("user").write(prompt)

        with st.spinner("Thinking..."):
            try:
                response = st.session_state.qa_chain({"question": prompt})
                answer = response.get("answer", "I couldn't find an answer.")
                st.session_state.chat_history.append({"role": "assistant", "content": answer})
                st.chat_message("assistant").write(answer)
            except Exception as e:
                # Surface the error in the chat itself so the user sees it.
                logger.error(f"Chat error: {e}")
                error_msg = f"Error: {str(e)}"
                st.session_state.chat_history.append({"role": "assistant", "content": error_msg})
                st.chat_message("assistant").write(error_msg)
154
 
 
155
 
156
def render_boq_output():
    """Render the extracted BOQ markdown with a download option."""
    st.subheader("📋 Extracted BOQ")

    boq = st.session_state.boq_output
    if not boq:
        st.info("Upload a PDF to see extracted BOQ items")
        return

    st.markdown(boq)
    st.download_button(
        label="📥 Download BOQ as Markdown",
        data=boq,
        file_name="boq_output.md",
        mime="text/markdown",
    )
172
 
 
 
 
 
173
 
174
def render_consistency_check():
    """Render the consistency-analysis tab: run count, trigger, metrics."""
    st.subheader("🔍 Consistency Check")

    if not st.session_state.document_loaded:
        st.info("Upload a PDF to run consistency checks")
        return

    runs = st.number_input(
        "Number of extraction runs",
        min_value=2,
        max_value=10,
        value=settings.consistency.default_runs,
        step=1,
    )

    if st.button("Run Consistency Check"):
        with st.spinner(f"Running {runs} extraction passes..."):
            try:
                result = st.session_state.consistency_checker.check(
                    chunks=st.session_state.chunks,
                    vector_store=st.session_state.vector_store,
                    runs=runs,
                )

                # Three summary metrics, side by side.
                metrics = [
                    ("Consistency Score", f"{result['consistency_score']:.1f}%"),
                    ("Avg Confidence", f"{result['avg_confidence']:.1f}%"),
                    ("Successful Runs", f"{result['successful_runs']}/{result['runs']}"),
                ]
                for col, (label, text) in zip(st.columns(3), metrics):
                    with col:
                        st.metric(label, text)

                if result['is_low_consistency']:
                    st.warning("⚠️ Low consistency detected. Results may vary.")
                else:
                    st.success("✅ Good consistency across extraction runs")

            except Exception as e:
                logger.error(f"Consistency check error: {e}")
                st.error(f"Error: {str(e)}")
218
 
 
 
219
 
220
def render_sidebar():
    """Render the sidebar: file upload, processing trigger, session reset."""
    with st.sidebar:
        st.title("📄 BOQ Extractor")
        st.markdown("---")

        uploaded = st.file_uploader(
            "Upload PDF Document",
            type=["pdf"],
            help="Upload a tender/BOQ document for extraction",
        )

        # Short-circuit: the process button is only rendered once a file exists.
        if uploaded and st.button("🚀 Process Document"):
            process_pdf(uploaded)

        st.markdown("---")

        if st.button("🗑️ Clear Session"):
            # Drop everything except the cached-services flag, then reseed.
            stale = [k for k in st.session_state.keys() if k != "services_initialized"]
            for key in stale:
                del st.session_state[key]
            initialize_session_state()
            st.success("Session cleared!")
            st.rerun()
247
+
248
 
249
def main():
    """Application entry point: page config, CSS, sidebar, and content tabs."""
    st.set_page_config(
        page_title=settings.streamlit.page_title,
        page_icon=settings.streamlit.page_icon,
        layout=settings.streamlit.layout,
        initial_sidebar_state="expanded",
    )

    # Keep the tab bar pinned to the top of the viewport, in light and dark themes.
    sticky_tabs_css = """
    <style>
    /* Make tabs sticky at top */
    .stTabs [data-baseweb="tab-list"] {
        position: sticky;
        top: 0;
        background-color: white;
        z-index: 999;
        padding-top: 1rem;
        padding-bottom: 0.5rem;
        border-bottom: 1px solid #e6e6e6;
    }

    /* Dark mode support */
    @media (prefers-color-scheme: dark) {
        .stTabs [data-baseweb="tab-list"] {
            background-color: #0e1117;
            border-bottom: 1px solid #333;
        }
    }

    /* Streamlit dark theme */
    [data-theme="dark"] .stTabs [data-baseweb="tab-list"] {
        background-color: #0e1117;
        border-bottom: 1px solid #333;
    }
    </style>
    """
    st.markdown(sticky_tabs_css, unsafe_allow_html=True)

    initialize_services()
    initialize_session_state()
    render_sidebar()

    # Main content tabs.
    tab_boq, tab_chat, tab_analysis = st.tabs(["📋 BOQ Output", "💬 Chat", "🔍 Analysis"])
    with tab_boq:
        render_boq_output()
    with tab_chat:
        render_chat_interface()
    with tab_analysis:
        render_consistency_check()


if __name__ == "__main__":
    main()