File size: 2,276 Bytes
b6033dc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
from huggingface_hub import hf_hub_download
from llama_index.core import Document
import json
import pandas as pd
import os
from llama_index.core import VectorStoreIndex
from llama_index.core import StorageContext
from llama_index.core.node_parser import SentenceSplitter
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.embeddings.huggingface import HuggingFaceEmbedding


def create_documents():
    """Build a list of llama-index ``Document`` objects from two HF datasets.

    Sources:
      1. ``rbiswasfc/arxiv-papers`` (parquet) — one Document per row, using the
         ``abstract`` column as text.
      2. ``jamescalam/ai-arxiv`` (jsonl) — one Document per line, using the
         ``content`` field as text.

    Returns:
        list[Document]: documents from BOTH datasets, each carrying a
        ``title`` metadata entry (may be None if absent in the source row).
    """
    documents = []

    # --- Dataset 1: arxiv abstracts (parquet) ---
    knowledge_base_1 = hf_hub_download(
        repo_id="rbiswasfc/arxiv-papers",
        filename="data/train-00000-of-00001.parquet",  # actual data file
        repo_type="dataset",
    )
    df = pd.read_parquet(knowledge_base_1)
    for _, row in df.iterrows():
        documents.append(
            Document(
                text=row["abstract"],  # or any text column
                metadata={
                    "title": row.get("title"),
                },
            )
        )

    # --- Dataset 2: ai-arxiv full papers (jsonl, one JSON object per line) ---
    knowledge_base_2 = hf_hub_download(
        repo_id="jamescalam/ai-arxiv",
        filename="train.jsonl",
        repo_type="dataset",
    )
    with open(knowledge_base_2, "r") as f:
        for line in f:
            data = json.loads(line)
            documents.append(
                Document(
                    text=data["content"],
                    metadata={
                        "title": data.get("title"),
                    },
                )
            )

    # Return only after BOTH loops complete. The original returned from inside
    # the jsonl loop, truncating dataset 2 to a single document.
    return documents

def ingest_documents():
    """Embed all documents and ingest them into the Qdrant cloud collection.

    Builds the corpus via ``create_documents``, embeds with BAAI/bge-small-en-v1.5,
    and indexes into the ``ai_tutor_knowledge`` Qdrant collection, splitting text
    into 2000-char chunks with 64-char overlap.

    Requires the ``Qdrant_key`` environment variable for Qdrant authentication.

    Returns:
        VectorStoreIndex: the populated index.
    """
    from qdrant_client import QdrantClient

    # Read the API key here: the original referenced an undefined `qdrant_key`
    # name (it was only a local inside create_documents) -> NameError at runtime.
    qdrant_key = os.getenv('Qdrant_key')

    qdrant_client = QdrantClient(
        url="https://afc34f29-812e-40ea-b515-a8cc6ae9ed37.us-east4-0.gcp.cloud.qdrant.io:6333",
        api_key=qdrant_key,
    )

    embed_model = HuggingFaceEmbedding(
        model_name="BAAI/bge-small-en-v1.5",
    )
    docs = create_documents()

    vector_store = QdrantVectorStore(
        client=qdrant_client,
        collection_name="ai_tutor_knowledge",
    )

    index = VectorStoreIndex.from_documents(
        docs,
        storage_context=StorageContext.from_defaults(
            vector_store=vector_store
        ),
        embed_model=embed_model,
        transformations=[SentenceSplitter(chunk_size=2000, chunk_overlap=64)],
        show_progress=True,
    )
    # Return the index so callers can query it; the original dropped it.
    return index