"""Build and ingest an AI-tutor knowledge base into a Qdrant collection.

Downloads two arXiv datasets from the Hugging Face Hub, wraps each record
in a llama-index ``Document``, and indexes the documents into a Qdrant
vector store using the BAAI bge-small embedding model.
"""

import json
import os

import pandas as pd
from huggingface_hub import hf_hub_download
from llama_index.core import Document, StorageContext, VectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.qdrant import QdrantVectorStore

# Qdrant cloud endpoint and target collection for the knowledge base.
QDRANT_URL = "https://afc34f29-812e-40ea-b515-a8cc6ae9ed37.us-east4-0.gcp.cloud.qdrant.io:6333"
COLLECTION_NAME = "ai_tutor_knowledge"


def create_documents():
    """Download both source datasets and convert every record to a Document.

    Returns:
        list[Document]: abstracts from ``rbiswasfc/arxiv-papers`` followed
        by full-text entries from ``jamescalam/ai-arxiv``, each carrying a
        ``title`` metadata field.
    """
    documents = []

    # Dataset 1: parquet file of arXiv paper abstracts.
    knowledge_base_1 = hf_hub_download(
        repo_id="rbiswasfc/arxiv-papers",
        filename="data/train-00000-of-00001.parquet",  # actual data file
        repo_type="dataset",
    )
    df = pd.read_parquet(knowledge_base_1)
    for _, row in df.iterrows():
        documents.append(
            Document(
                text=row["abstract"],  # or any text column
                metadata={"title": row.get("title")},
            )
        )

    # Dataset 2: JSONL file with one JSON object (full paper text) per line.
    knowledge_base_2 = hf_hub_download(
        repo_id="jamescalam/ai-arxiv",
        filename="train.jsonl",
        repo_type="dataset",
    )
    with open(knowledge_base_2, "r", encoding="utf-8") as f:
        for line in f:
            data = json.loads(line)
            documents.append(
                Document(
                    text=data["content"],
                    metadata={"title": data.get("title")},
                )
            )

    return documents


def ingest_documents():
    """Embed all documents and ingest them into the Qdrant collection.

    Reads the Qdrant API key from the ``Qdrant_key`` environment variable.

    Returns:
        VectorStoreIndex: the index built over the ingested documents.

    Raises:
        RuntimeError: if the ``Qdrant_key`` environment variable is unset.
    """
    from qdrant_client import QdrantClient

    # BUG FIX: qdrant_key used to be a local variable inside
    # create_documents() (where it was never used) and was unbound here,
    # raising NameError. Read the env var where it is actually consumed.
    qdrant_key = os.getenv("Qdrant_key")
    if not qdrant_key:
        raise RuntimeError("Qdrant_key environment variable is not set")

    qdrant_client = QdrantClient(
        url=QDRANT_URL,
        api_key=qdrant_key,
    )

    embed_model = HuggingFaceEmbedding(
        model_name="BAAI/bge-small-en-v1.5",
    )

    docs = create_documents()
    vector_store = QdrantVectorStore(
        client=qdrant_client,
        collection_name=COLLECTION_NAME,
    )
    # from_documents() chunks with the SentenceSplitter, embeds each node,
    # and writes the vectors into the Qdrant-backed store.
    index = VectorStoreIndex.from_documents(
        docs,
        storage_context=StorageContext.from_defaults(vector_store=vector_store),
        embed_model=embed_model,
        transformations=[SentenceSplitter(chunk_size=2000, chunk_overlap=64)],
        show_progress=True,
    )
    return index