# data_collection.py
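"""Build a knowledge base for the chatbot: pull two public arXiv datasets from
the Hugging Face Hub, wrap them as LlamaIndex Documents, and ingest them into
a Qdrant vector collection using BAAI/bge-small-en-v1.5 embeddings."""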
import json
import os

import pandas as pd
from huggingface_hub import hf_hub_download
from llama_index.core import Document, StorageContext, VectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
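# Dependencies (an assumption inferred from the imports above, not stated in
# the original file):
#   pip install huggingface-hub pandas qdrant-client llama-index \
#       llama-index-vector-stores-qdrant llama-index-embeddings-huggingface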
def create_documents():
    """Download two arXiv datasets from the Hugging Face Hub and wrap them as Documents."""
    documents = []

    # Dataset 1: paper abstracts stored as a parquet file.
    knowledge_base_1 = hf_hub_download(
        repo_id="rbiswasfc/arxiv-papers",
        filename="data/train-00000-of-00001.parquet",  # actual data file
        repo_type="dataset",
    )
    df = pd.read_parquet(knowledge_base_1)
    for _, row in df.iterrows():
        text = row["abstract"]  # or any text column
        documents.append(
            Document(
                text=text,
                metadata={"title": row.get("title")},
            )
        )
    # Dataset 2: full paper contents stored as JSON lines.
    knowledge_base_2 = hf_hub_download(
        repo_id="jamescalam/ai-arxiv",
        filename="train.jsonl",
        repo_type="dataset",
    )
    with open(knowledge_base_2, "r") as f:
        for line in f:
            data = json.loads(line)
            doc = Document(
                text=data["content"],
                metadata={"title": data.get("title")},
            )
            documents.append(doc)

    return documents
def ingest_documents():
    """Embed all documents and ingest them into a Qdrant-backed vector index."""
    # Read the API key here, where it is used; the original set it inside
    # create_documents(), which left it undefined in this function.
    qdrant_key = os.getenv("Qdrant_key")
    qdrant_client = QdrantClient(
        url="https://afc34f29-812e-40ea-b515-a8cc6ae9ed37.us-east4-0.gcp.cloud.qdrant.io:6333",
        api_key=qdrant_key,
    )
    embed_model = HuggingFaceEmbedding(
        model_name="BAAI/bge-small-en-v1.5",
    )
    docs = create_documents()
    vector_store = QdrantVectorStore(
        client=qdrant_client,
        collection_name="ai_tutor_knowledge",
    )
    # Split each document into ~2000-token chunks with a 64-token overlap,
    # embed the chunks, and store them in the Qdrant collection.
    index = VectorStoreIndex.from_documents(
        docs,
        storage_context=StorageContext.from_defaults(vector_store=vector_store),
        embed_model=embed_model,
        transformations=[SentenceSplitter(chunk_size=2000, chunk_overlap=64)],
        show_progress=True,
    )
    return index
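
if __name__ == "__main__":
    # Minimal entry point (an addition, not part of the original file): build
    # the index, then run a retrieval-only smoke test. as_retriever() reuses
    # the embedding model attached to the index, so no LLM is required.
    index = ingest_documents()
    nodes = index.as_retriever(similarity_top_k=3).retrieve("What is attention?")
    for node in nodes:
        print(node.node.metadata.get("title"), node.score)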