"""Ingest the lavita/ChatDoctor-HealthCareMagic-100k QA dataset into the
Pinecone "doctor-ai" index for retrieval-augmented generation."""
from langchain_pinecone import PineconeVectorStore
from datasets import load_dataset
from langchain.schema import Document
from src.helper import load_pdf_files, filter_to_minimal_docs, text_splitter, download_embeddings
from pinecone import Pinecone
from pinecone import ServerlessSpec
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment.
load_dotenv()
import os

# NOTE(review): the key looked up is literally "Pinecone_API_KEY" (mixed
# case); it is re-exported under "PINECONE_API_KEY", the name the
# Pinecone/LangChain clients read. Confirm the .env file uses the
# mixed-case key.
Pinecone_API_KEY = os.getenv("Pinecone_API_KEY")
if not Pinecone_API_KEY:
    # Fail fast with a clear message instead of the TypeError that
    # `os.environ[...] = None` would raise.
    raise RuntimeError("Pinecone_API_KEY is not set in the environment/.env file")
os.environ["PINECONE_API_KEY"] = Pinecone_API_KEY
#extracted_data= load_pdf_files("data")
#minimal_docs = filter_to_minimal_docs(extracted_data)
#text_chunks = text_splitter(minimal_docs)
# Embedding model shared by ingestion and querying. Presumably produces
# 384-dim vectors to match the index `dimension` below — TODO confirm.
embeddings = download_embeddings()

# Pinecone client. (The original `Pinecone_API_KEY = Pinecone_API_KEY`
# self-assignment was a no-op and has been removed.)
pc = Pinecone(api_key=Pinecone_API_KEY)
# Target index; created once on first run, reused afterwards.
index_name = "doctor-ai"

# Create the serverless index only if it does not already exist.
# dimension=384 must equal the embedding model's output size.
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=384,
        metric="cosine",  # cosine similarity for normalized text embeddings
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        ))

# Low-level handle to the (possibly just-created) index.
index = pc.Index(index_name)
dataset = load_dataset("lavita/ChatDoctor-HealthCareMagic-100k")

# One Document per QA record: the instruction plus the patient's input
# form the question; the model output is the answer.
_SOURCE_TAG = "huggingface_lavita/ChatDoctor-HealthCareMagic-100k"
docs = [
    Document(
        page_content=(
            f"Q: {item.get('instruction', '')} {item.get('input', '')}\n"
            f"A: {item.get('output', '')}"
        ),
        metadata={"source": _SOURCE_TAG},
    )
    for item in dataset["train"]
]
#docssearch=PineconeVectorStore.from_documents(
# documents=text_chunks,
# embedding=embeddings,
# index_name=index_name
#)
# LangChain wrapper over the existing index; embeds queries/documents with
# the same model used at ingestion time.
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings
)
def chunked_upload(docs, batch_size=50):
    """Embed and upsert ``docs`` into the Pinecone index in batches.

    Batching keeps each request small; ``batch_size`` controls how many
    documents are sent per ``add_documents`` call.
    """
    for start in range(0, len(docs), batch_size):
        docsearch.add_documents(docs[start:start + batch_size])


chunked_upload(docs, batch_size=50)
# Bug fix: the script previously called `docsearch.add_documents(docs)`
# again after the batched upload, inserting every document twice (the line
# also ended with a stray "|", a syntax error). The duplicate call and the
# garbling are removed.