File size: 3,382 Bytes
ea71a81
 
 
 
 
 
af51700
ea71a81
 
af51700
 
 
 
 
ea71a81
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4df7450
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ea71a81
 
4df7450
 
ea71a81
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
af51700
 
ea71a81
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
from langchain_community.document_loaders import PyMuPDFLoader, TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_qdrant import QdrantVectorStore, FastEmbedSparse
from langchain_openai import OpenAIEmbeddings
from src import config
import hashlib
import os

# Header labels looked for in document text, paired with the metadata key each
# one populates. Order matters: the first label found on a line wins.
_METADATA_LABELS = (
    ("Clinical Trial ID:", "trial_id"),
    ("Study Title:", "title"),
    ("Date:", "date"),
)


def _file_md5(file_path, chunk_size=65536):
    """Return the hex MD5 digest of a file, reading it in fixed-size chunks."""
    digest = hashlib.md5()
    with open(file_path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()


def _extract_metadata(text):
    """Collect labelled header values ("<label> <value>" lines) from text."""
    metadata = {}
    for line in text.split("\n"):
        for label, key in _METADATA_LABELS:
            if label in line:
                # Keep everything after the label. Splitting on ":" and taking
                # the last piece would truncate values that themselves contain
                # a colon (titles with subtitles, timestamps such as 10:30:00).
                metadata[key] = line.partition(label)[2].strip()
                break
    return metadata


def ingest_file(file_path: str) -> bool:
    """
    Document ingestion pipeline for GeneSeek.

    Loads a file, attaches header metadata, splits it into chunks, embeds the
    chunks and upserts them into the Qdrant vector store. Duplicate ingestion
    is prevented by recording the MD5 hash of every ingested file in
    ``config.DATA_DIR / "ingested_hashes.txt"``.

    Args:
        file_path: Path of the file to ingest. Files whose name ends in
            ``.pdf`` (case-insensitive) are loaded with PyMuPDFLoader; every
            other file is loaded with TextLoader.

    Returns:
        True if the file was ingested, False if its hash was already recorded
        and the file was skipped.

    Raises:
        OSError: If the file cannot be read, or the hash record cannot be
            read or written.
    """
    path_str = str(file_path)

    # 1. Duplicate check
    file_hash = _file_md5(path_str)

    hash_record_path = config.DATA_DIR / "ingested_hashes.txt"

    if hash_record_path.exists():
        with open(hash_record_path, "r", encoding="utf-8") as f:
            known_hashes = {line.strip() for line in f}
        if file_hash in known_hashes:
            print("File already ingested. Skipping.")
            return False

    # Make sure the hash record can be written *before* doing the expensive
    # embedding/upsert work; otherwise a missing data directory would leave
    # the documents stored but unrecorded, and the next run would duplicate
    # them.
    hash_record_path.parent.mkdir(parents=True, exist_ok=True)

    # 2. Load file
    print(f"Loading file: {file_path}...")
    if path_str.lower().endswith(".pdf"):
        loader = PyMuPDFLoader(path_str)
    else:
        loader = TextLoader(path_str)

    docs = loader.load()

    # Attach metadata to each doc
    for doc in docs:
        doc.metadata.update(_extract_metadata(doc.page_content))

    # 3. Split
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        separators=["\n\n", "\n", ".", " "],
    )
    splits = text_splitter.split_documents(docs)
    print(f"Split into {len(splits)} chunks.")

    # 4. Initialize embeddings
    dense_embeddings = OpenAIEmbeddings(
        model=config.EMBEDDING_MODEL,
        openai_api_key=config.OPENAI_API_KEY,
    )
    sparse_embeddings = FastEmbedSparse(model_name=config.SPARSE_MODEL)

    # 5. Use shared client to check whether the collection already exists
    existing = {c.name for c in config.qdrant_client.get_collections().collections}

    if config.COLLECTION_NAME not in existing:
        print("Collection not found. Creating new collection...")
        # NOTE(review): this branch connects via QDRANT_URL / QDRANT_API_KEY
        # from the environment rather than through config.qdrant_client --
        # confirm both point at the same Qdrant instance.
        QdrantVectorStore.from_documents(
            documents=splits,
            embedding=dense_embeddings,
            sparse_embedding=sparse_embeddings,
            url=os.getenv("QDRANT_URL"),
            api_key=os.getenv("QDRANT_API_KEY"),
            collection_name=config.COLLECTION_NAME,
            retrieval_mode="hybrid",
        )
    else:
        print("Collection exists. Appending documents...")
        vector_store = QdrantVectorStore(
            client=config.qdrant_client,
            collection_name=config.COLLECTION_NAME,
            embedding=dense_embeddings,
            sparse_embedding=sparse_embeddings,
            retrieval_mode="hybrid",
        )
        vector_store.add_documents(splits)

    # 6. Record hash (only after the upsert succeeded)
    with open(hash_record_path, "a", encoding="utf-8") as f:
        f.write(file_hash + "\n")

    print("Ingestion complete.")
    return True