Spaces:
Sleeping
Sleeping
| from huggingface_hub import hf_hub_download | |
| from llama_index.core import Document | |
| import json | |
| import pandas as pd | |
| import os | |
| from llama_index.core import VectorStoreIndex | |
| from llama_index.core import StorageContext | |
| from llama_index.core.node_parser import SentenceSplitter | |
| from llama_index.vector_stores.qdrant import QdrantVectorStore | |
| from llama_index.embeddings.huggingface import HuggingFaceEmbedding | |
def create_documents():
    """Build a combined list of llama-index ``Document`` objects from two
    public Hugging Face datasets.

    Sources:
      1. ``rbiswasfc/arxiv-papers`` (parquet) — one Document per row, using
         the ``abstract`` column as text and ``title`` as metadata.
      2. ``jamescalam/ai-arxiv`` (jsonl) — one Document per line, using the
         ``content`` field as text and ``title`` as metadata.

    Returns:
        list[Document]: documents from both datasets, parquet rows first.
    """
    documents = []

    # Dataset 1: arxiv abstracts stored as a parquet file on the HF hub.
    knowledge_base_1 = hf_hub_download(
        repo_id="rbiswasfc/arxiv-papers",
        filename="data/train-00000-of-00001.parquet",  # actual data file
        repo_type="dataset",
    )
    df = pd.read_parquet(knowledge_base_1)
    for _, row in df.iterrows():
        text = row["abstract"]  # or any text column
        documents.append(
            Document(
                text=text,
                metadata={
                    "title": row.get("title"),
                },
            )
        )

    # Dataset 2: full-paper content stored as JSON lines on the HF hub.
    knowledge_base_2 = hf_hub_download(
        repo_id="jamescalam/ai-arxiv",
        filename="train.jsonl",
        repo_type="dataset",
    )
    with open(knowledge_base_2, "r") as f:
        for line in f:
            data = json.loads(line)
            documents.append(
                Document(
                    text=data["content"],
                    metadata={
                        "title": data.get("title"),
                    },
                )
            )

    return documents
def ingest_documents():
    """Embed all documents from :func:`create_documents` and ingest them
    into the ``ai_tutor_knowledge`` collection of a Qdrant cluster.

    Environment variables:
        Qdrant_key: API key for the Qdrant cloud cluster (required).
        QDRANT_URL: optional override for the cluster URL; defaults to the
            project's hard-coded GCP cluster endpoint.

    Returns:
        VectorStoreIndex: the index built over the ingested documents.
    """
    from qdrant_client import QdrantClient

    # BUG FIX: the API key was previously read inside create_documents(),
    # leaving `qdrant_key` undefined here (NameError). Read it locally.
    qdrant_key = os.getenv('Qdrant_key')

    qdrant_url = os.getenv(
        "QDRANT_URL",
        "https://afc34f29-812e-40ea-b515-a8cc6ae9ed37.us-east4-0.gcp.cloud.qdrant.io:6333",
    )
    qdrant_client = QdrantClient(
        url=qdrant_url,
        api_key=qdrant_key,
    )

    embed_model = HuggingFaceEmbedding(
        model_name="BAAI/bge-small-en-v1.5",
    )

    docs = create_documents()

    vector_store = QdrantVectorStore(
        client=qdrant_client,
        collection_name="ai_tutor_knowledge",
    )

    # Chunk documents into ~2000-token pieces with a 64-token overlap,
    # embed each chunk, and push everything into the Qdrant collection.
    index = VectorStoreIndex.from_documents(
        docs,
        storage_context=StorageContext.from_defaults(
            vector_store=vector_store
        ),
        embed_model=embed_model,
        transformations=[SentenceSplitter(chunk_size=2000, chunk_overlap=64)],
        show_progress=True,
    )
    return index