from pathlib import Path
from core import audit, vectorstore
from core.chunker import chunk_bidder
from core.ocr_pipeline import extract_document
from core.schemas import Criterion, Evidence
def process_bidder(bidder_id: str, files: list[Path]) -> None:
    """Ingest a bidder's files into the vector store and audit-log each one.

    For every file: OCR-extract pages, chunk them, attach per-chunk metadata,
    and add the result to the shared "bidder_chunks" collection. Files that
    produce no chunks are skipped silently (no audit entry).
    """
    store = vectorstore.get_collection("bidder_chunks")
    for path in files:
        pages = extract_document(path)
        doc_chunks = chunk_bidder(pages, bidder_id, path.name)
        if not doc_chunks:
            continue
        metas = []
        for c in doc_chunks:
            conf = c["ocr_confidence"]
            metas.append({
                "bidder_id": bidder_id,
                "doc_name": c["doc_name"],
                "page": c["page"],
                "source_type": c["source_type"],
                # -1.0 is the "no OCR confidence" sentinel; gather_evidence
                # maps it back to None on read.
                "ocr_confidence": -1.0 if conf is None else float(conf),
            })
        vectorstore.add_chunks(store, doc_chunks, metas)
        audit.log(
            "bidder_processed",
            bidder_id=bidder_id,
            doc_name=path.name,
            chunk_count=len(doc_chunks),
        )
def gather_evidence(bidder_id: str, criterion: Criterion, k: int = 4) -> list[Evidence]:
    """Retrieve up to *k* evidence chunks for a criterion, scoped to one bidder.

    Queries the "bidder_chunks" collection with the criterion title plus its
    query hints, filtered to the given bidder_id, and wraps each hit in an
    Evidence record. The -1.0 sentinel stored at ingest time for a missing
    OCR confidence is translated back to None here.
    """
    hints = ' '.join(criterion.query_hints)
    query_text = f"{criterion.title} {hints}"
    collection = vectorstore.get_collection("bidder_chunks")
    hits = vectorstore.query(
        collection, query_text, k=k, where={"bidder_id": bidder_id}
    )
    found: list[Evidence] = []
    for hit in hits:
        meta = hit["metadata"]
        conf = meta.get("ocr_confidence")
        # Negative values encode "confidence unknown" in the store.
        if conf is not None and conf < 0:
            conf = None
        found.append(Evidence(
            bidder_id=bidder_id,
            doc_name=meta["doc_name"],
            page=meta["page"],
            text=hit["text"],
            source_type=meta["source_type"],
            ocr_confidence=conf,
        ))
    return found
|