File size: 1,912 Bytes
661eb14
a337229
 
 
 
661eb14
 
 
 
a337229
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
661eb14
 
 
a337229
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
from pathlib import Path

from core import audit, vectorstore
from core.chunker import chunk_bidder
from core.ocr_pipeline import extract_document
from core.schemas import Criterion, Evidence


def process_bidder(bidder_id: str, files: list[Path]) -> None:
    """Index every file of one bidder into the ``bidder_chunks`` collection.

    Each file is run through ``extract_document`` and ``chunk_bidder``; the
    resulting chunks are added to the collection together with per-chunk
    metadata, and one ``bidder_processed`` audit entry is logged per file.
    Files that yield no chunks are skipped without being stored or audited.

    Args:
        bidder_id: Identifier stamped on every stored chunk's metadata.
        files: Paths handed one at a time to ``extract_document``.
    """
    store = vectorstore.get_collection("bidder_chunks")

    for path in files:
        doc_chunks = chunk_bidder(extract_document(path), bidder_id, path.name)
        if not doc_chunks:
            continue

        chunk_metadata = []
        for item in doc_chunks:
            entry = {
                "bidder_id": bidder_id,
                "doc_name": item["doc_name"],
                "page": item["page"],
                "source_type": item["source_type"],
            }
            # A missing OCR confidence is stored as the sentinel -1.0 instead
            # of None; gather_evidence maps negative values back to None.
            # NOTE(review): presumably the store's metadata cannot hold None
            # -- confirm against core.vectorstore.
            if item["ocr_confidence"] is None:
                entry["ocr_confidence"] = -1.0
            else:
                entry["ocr_confidence"] = float(item["ocr_confidence"])
            chunk_metadata.append(entry)

        vectorstore.add_chunks(store, doc_chunks, chunk_metadata)
        audit.log(
            "bidder_processed",
            bidder_id=bidder_id,
            doc_name=path.name,
            chunk_count=len(doc_chunks),
        )


def gather_evidence(bidder_id: str, criterion: Criterion, k: int = 4) -> list[Evidence]:
    """Query the ``bidder_chunks`` collection for one bidder and criterion.

    The query text is the criterion title followed by its query hints joined
    with spaces. The query is filtered to chunks whose ``bidder_id`` metadata
    equals ``bidder_id``.

    Args:
        bidder_id: Bidder whose chunks are searched.
        criterion: Supplies ``title`` and ``query_hints`` for the query text.
        k: Passed through to ``vectorstore.query`` as ``k``.

    Returns:
        One ``Evidence`` per query result, in the order the results were
        returned.
    """
    title = criterion.title
    hints = " ".join(criterion.query_hints)
    query_text = f"{title} {hints}"

    hits = vectorstore.query(
        vectorstore.get_collection("bidder_chunks"),
        query_text,
        k=k,
        where={"bidder_id": bidder_id},
    )

    def _to_evidence(hit) -> Evidence:
        # Build one Evidence record from a single query result.
        info = hit["metadata"]
        raw_conf = info.get("ocr_confidence")
        # Negative confidences are the "not available" sentinel written by
        # process_bidder (-1.0); expose them to callers as None.
        is_sentinel = raw_conf is not None and raw_conf < 0
        return Evidence(
            bidder_id=bidder_id,
            doc_name=info["doc_name"],
            page=info["page"],
            text=hit["text"],
            source_type=info["source_type"],
            ocr_confidence=None if is_sentinel else raw_conf,
        )

    return [_to_evidence(hit) for hit in hits]