BOQ_of_Tenders_Agent / services /consistency.py
Sahil Garg
improvised output table quality and columns
ae1d55a
"""
Consistency checking service for evaluating BOQ extraction reliability.
"""
from difflib import SequenceMatcher
from itertools import combinations
from typing import Any, Dict, List, Optional

from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from loguru import logger

from config.settings import settings
from services.boq_extractor import BOQExtractor
class ConsistencyChecker:
    """
    Checks consistency of BOQ extractions from outputs.

    Consistency is the average pairwise text similarity between the outputs
    of repeated extraction runs, reported together with the average of the
    per-row confidence values found in each output table.

    Example:
        checker = ConsistencyChecker(boq_extractor)
        result = checker.check_from_outputs(outputs)
        print(f"Consistency: {result['consistency_score']}%")
    """

    def __init__(
        self,
        boq_extractor: "BOQExtractor",
        default_runs: Optional[int] = None,
        low_threshold: Optional[float] = None,
    ):
        """
        Initialize consistency checker.

        Args:
            boq_extractor: BOQ extractor instance. Required.
            default_runs: Default number of extraction runs. Defaults to config value.
            low_threshold: Threshold for low consistency warning. Defaults to config value.
        """
        self.boq_extractor = boq_extractor
        # Compare against None explicitly: a plain `or` would discard a
        # deliberate falsy argument such as low_threshold=0.0 and silently
        # substitute the configured value.
        self.default_runs = (
            default_runs
            if default_runs is not None
            else settings.consistency.default_runs
        )
        # Stored for callers; the methods of this class do not read it.
        self.low_threshold = (
            low_threshold
            if low_threshold is not None
            else settings.consistency.low_consistency_threshold
        )

    def _calculate_similarity(self, results: List[str]) -> float:
        """
        Calculate average pairwise similarity between results.

        Pairs in which either result is empty are skipped, so empty results
        do not lower the score.

        Args:
            results: List of BOQ extraction results.

        Returns:
            Average similarity score (0.0 to 1.0). Returns 0.0 when fewer than
            two non-empty results are available to compare.
        """
        # NOTE(review): SequenceMatcher's default `autojunk` heuristic applies
        # to sequences of 200+ items and may depress ratios for long outputs;
        # confirm whether autojunk=False is wanted here.
        ratios = [
            SequenceMatcher(None, first, second).ratio()
            for first, second in combinations(results, 2)
            if first and second
        ]
        return sum(ratios) / len(ratios) if ratios else 0.0

    @staticmethod
    def _split_table_row(line: str) -> List[str]:
        """Return the stripped cells of a pipe-delimited row, dropping the outer segments."""
        return [cell.strip() for cell in line.split('|')[1:-1]]

    def _extract_confidence_scores(self, boq: str) -> List[float]:
        """
        Extract confidence scores from BOQ output.

        The first table row containing the text 'Confidence' is treated as the
        header and fixes the column index. Every other pipe-delimited row that
        does not contain 'Confidence' is then read at that index.

        Args:
            boq: Formatted BOQ output string.

        Returns:
            List of confidence score values. Empty when the input is empty or
            no 'Confidence' column is found. Cells that are blank, a
            placeholder ('NA', 'N/A', '-') or not numeric are skipped; a
            trailing '%' is removed before conversion.
        """
        if not boq:
            return []

        rows = [raw.strip() for raw in boq.split('\n')]

        # Find confidence column index from header
        confidence_idx = None
        for row in rows:
            if '|' not in row or 'Confidence' not in row or row.startswith('| ---'):
                continue
            cells = self._split_table_row(row)
            confidence_idx = next(
                (idx for idx, cell in enumerate(cells) if 'Confidence' in cell),
                None,
            )
            if confidence_idx is not None:
                break

        if confidence_idx is None:
            return []

        # Extract confidence values from data rows
        confidences = []
        for row in rows:
            if '|' not in row or row.startswith('| ---') or 'Confidence' in row:
                continue
            cells = self._split_table_row(row)
            if len(cells) <= confidence_idx:
                continue
            cell = cells[confidence_idx]
            if not cell or cell in ('NA', 'N/A', '-'):
                continue
            try:
                confidences.append(float(cell.rstrip('%')))
            except ValueError:
                # Non-numeric cell (e.g. a separator row written without the
                # leading '| ---' form): ignore it and keep scanning.
                continue
        return confidences

    def check_from_outputs(self, outputs: List[str]) -> Dict[str, Any]:
        """
        Compute consistency metrics from a list of BOQ outputs.

        Args:
            outputs: List of BOQ output strings.

        Returns:
            Dictionary with consistency metrics:
                consistency_score: Average pairwise similarity as a percentage.
                runs: Number of outputs received.
                successful_runs: Number of non-empty outputs.
                avg_similarity: Average pairwise similarity (0.0 to 1.0).
                avg_confidence: Mean of all confidence values found (0 if none).
                total_confidence_scores: Number of confidence values found.
            Float metrics are rounded to two decimal places.
        """
        logger.info(f'Computing consistency from {len(outputs)} outputs')

        # Calculate similarity
        avg_similarity = self._calculate_similarity(outputs)
        consistency_score = avg_similarity * 100

        # Extract and average confidence scores
        all_confidences = []
        for boq in outputs:
            all_confidences.extend(self._extract_confidence_scores(boq))
        avg_confidence = (
            sum(all_confidences) / len(all_confidences) if all_confidences else 0
        )

        successful_runs = sum(1 for output in outputs if output)

        result = {
            "consistency_score": round(consistency_score, 2),
            "runs": len(outputs),
            "successful_runs": successful_runs,
            "avg_similarity": round(avg_similarity, 2),
            "avg_confidence": round(avg_confidence, 2),
            "total_confidence_scores": len(all_confidences)
        }
        logger.info(f'Consistency check from outputs completed: {result}')
        return result