# store_index.py (DoctorAI) — build the Pinecone index and upload the
# ChatDoctor-HealthCareMagic Q&A corpus as embedded documents.
# Last updated by ManailFatima (commit 627d6ac).
from langchain_pinecone import PineconeVectorStore
from datasets import load_dataset
from langchain.schema import Document
from src.helper import load_pdf_files, filter_to_minimal_docs, text_splitter, download_embeddings
from pinecone import Pinecone
from pinecone import ServerlessSpec
from dotenv import load_dotenv
load_dotenv()
import os
# Read the Pinecone credential loaded by load_dotenv() above, and mirror it
# under the canonical name ("PINECONE_API_KEY") that the Pinecone/LangChain
# clients look up implicitly.
Pinecone_API_KEY = os.getenv("Pinecone_API_KEY")
if not Pinecone_API_KEY:
    # Fail fast with a clear message instead of a confusing SDK auth error later.
    raise RuntimeError("Pinecone_API_KEY is not set; add it to your .env file")
os.environ["PINECONE_API_KEY"] = Pinecone_API_KEY

# Alternative local-PDF ingestion pipeline (kept for reference):
#extracted_data= load_pdf_files("data")
#minimal_docs = filter_to_minimal_docs(extracted_data)
#text_chunks = text_splitter(minimal_docs)

# Embedding model used for both indexing and querying.
embeddings = download_embeddings()

pc = Pinecone(api_key=Pinecone_API_KEY)
index_name = "doctor-ai"
# Create the serverless index on first run; subsequent runs reuse it.
# NOTE(review): dimension=384 must match the output size of the embedding
# model returned by download_embeddings() — confirm if that model changes.
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=384,
        metric="cosine",  # cosine similarity suits normalized sentence embeddings
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        ),
    )

index = pc.Index(index_name)
# Pull the HealthCareMagic Q&A corpus and wrap each record as a LangChain
# Document so it can be embedded and upserted into the vector store.
dataset = load_dataset("lavita/ChatDoctor-HealthCareMagic-100k")

docs = []
for item in dataset["train"]:
    # "instruction" appears to carry the question text and "input" optional
    # extra context (often empty) — strip the join so an empty "input" does
    # not leave a dangling trailing space in the stored text.
    question = (item.get("instruction", "") + " " + item.get("input", "")).strip()
    answer = item.get("output", "")
    docs.append(Document(
        page_content=f"Q: {question}\nA: {answer}",
        metadata={"source": "huggingface_lavita/ChatDoctor-HealthCareMagic-100k"},
    ))
# Connect a LangChain vector-store wrapper to the already-created index.
# Documents are added explicitly below (in batches) rather than via
# from_documents, which would try to upsert everything in one request.
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings,
)
def chunked_upload(docs, batch_size=50, store=None):
    """Add documents to the vector store in fixed-size batches.

    Batching keeps each upsert request small; pushing the whole corpus in a
    single call risks exceeding the service's request-size limits.

    Args:
        docs: Sequence of Document objects to upload.
        batch_size: Maximum number of documents per request; must be >= 1.
        store: Object exposing ``add_documents``; defaults to the
            module-level ``docsearch`` vector store.

    Raises:
        ValueError: If ``batch_size`` is less than 1 (a zero step would
            raise deep inside ``range``; a negative one would silently
            upload nothing).
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    target = docsearch if store is None else store
    for start in range(0, len(docs), batch_size):
        target.add_documents(docs[start:start + batch_size])
# Upload the corpus exactly once, in batches. The original code followed this
# with a second, full `docsearch.add_documents(docs)`, which would re-upsert
# every record and duplicate the entire corpus in the index — removed.
chunked_upload(docs, batch_size=50)