# File size: 2,618 Bytes
# 0fc1003
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import os
import logging
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain_openai import ChatOpenAI
import json

# Configure root logging once for the entire script: timestamped INFO lines.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Module-scoped logger, named after this module.
logger = logging.getLogger(__name__)

# Resolve all paths relative to this file's location, not the CWD.
working_dir = os.path.dirname(os.path.abspath(__file__))
parent_dir = os.path.dirname(working_dir)
data_dir = f"{parent_dir}/"
vector_db_dir = f"{parent_dir}/vector_db"


logger.info("Reading Files Process Started...")
all_records = []

# Gather every record from each *.json file found directly in data_dir.
for file_name in os.listdir(data_dir):
    if not file_name.endswith(".json"):
        continue

    file_path = os.path.join(data_dir, file_name)
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # A file may hold a whole list of records or just a single record dict.
    if isinstance(data, list):
        all_records.extend(data)
    else:
        all_records.append(data)

print("Total drug records:", len(all_records))

documents = []

# Sections to expose to the RAG pipeline — hoisted out of the loop since the
# list is identical for every record.
sections = [
    "indications_and_usage",
    "warnings_and_cautions",
    "adverse_reactions",
    "drug_interactions"
]

# BUG FIX: iterate over ALL collected records. The original looped over
# `data`, which is just the payload of the LAST file read above (and would
# crash with "string indices" errors if that payload was a single dict).
for record in all_records:

    # `generic_name` appears to be a list of names (openFDA-style) — take the
    # first, guarding against a missing key or an empty list. TODO confirm
    # schema against the source data.
    names = record.get("generic_name") or ["UNKNOWN"]
    drug = names[0].upper()

    for section in sections:
        # Each present section holds a list of text passages; absent sections
        # simply contribute nothing.
        for text in record.get(section, []):
            documents.append(
                Document(
                    page_content=text,
                    metadata={
                        "generic_name": drug,
                        "section": section
                    }
                )
            )

print("Documents created:", len(documents))

logger.info("Split chunk Files Process Started...")

# Break the documents into ~800-character chunks, overlapping neighbouring
# chunks by 150 characters so context isn't lost at the boundaries.
splitter = RecursiveCharacterTextSplitter(chunk_overlap=150, chunk_size=800)
chunked_docs = splitter.split_documents(documents)

print("Chunks created:", len(chunked_docs))

logger.info("Embeddings Files Process Started...")

# Small, fast sentence-transformer model used to embed each chunk.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

print("Chroma ready ✅")

logger.info(" VectorDB Process Started...")

# Embed every chunk and persist the index to disk.
# FIX: use the `vector_db_dir` computed at the top of the file — it was
# defined but never used, while the persist directory was hard-coded to
# "./chroma_db" and therefore silently depended on the current working dir.
vectordb = Chroma.from_documents(
    documents=chunked_docs,
    embedding=embeddings,
    persist_directory=vector_db_dir
)

print("Vector DB created successfully ✅")
logger.info("VectorDB Process Completed...")