Priya-0914 committed on
Commit
b6033dc
·
verified ·
1 Parent(s): 796621b

Create data_collection.py

Browse files
Files changed (1) hide show
  1. data_collection.py +83 -0
data_collection.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from huggingface_hub import hf_hub_download
2
+ from llama_index.core import Document
3
+ import json
4
+ import pandas as pd
5
+ import os
6
+ from llama_index.core import VectorStoreIndex
7
+ from llama_index.core import StorageContext
8
+ from llama_index.core.node_parser import SentenceSplitter
9
+ from llama_index.vector_stores.qdrant import QdrantVectorStore
10
+ from llama_index.embeddings.huggingface import HuggingFaceEmbedding
11
+
12
+
13
def create_documents():
    """Download both arXiv datasets and wrap every record in a ``Document``.

    Two Hugging Face dataset files are fetched with ``hf_hub_download``:

    * ``rbiswasfc/arxiv-papers`` (parquet): each row's ``abstract`` column
      becomes the document text.
    * ``jamescalam/ai-arxiv`` (JSON Lines): each record's ``content`` field
      becomes the document text.

    In both cases the record's ``title`` (``None`` when absent) is stored in
    the document metadata.

    Returns:
        list: ``Document`` objects, parquet-derived ones first, followed by
        the JSON Lines ones, each group in file order.

    Raises:
        KeyError: if a record lacks the ``abstract`` / ``content`` field.
    """
    documents = []

    abstracts_path = hf_hub_download(
        repo_id="rbiswasfc/arxiv-papers",
        filename="data/train-00000-of-00001.parquet",  # actual data file
        repo_type="dataset",
    )
    abstracts = pd.read_parquet(abstracts_path)
    for _, row in abstracts.iterrows():
        documents.append(
            Document(
                text=row["abstract"],  # or any text column
                metadata={"title": row.get("title")},
            )
        )

    papers_path = hf_hub_download(
        repo_id="jamescalam/ai-arxiv",
        filename="train.jsonl",
        repo_type="dataset",
    )
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default text encoding.
    with open(papers_path, "r", encoding="utf-8") as f:
        for line in f:
            # A blank line (e.g. a trailing newline) is not a JSON record.
            if not line.strip():
                continue
            record = json.loads(line)
            documents.append(
                Document(
                    text=record["content"],
                    metadata={"title": record.get("title")},
                )
            )

    return documents
55
+
56
def ingest_documents():
    """Embed the documents from ``create_documents`` and store them in Qdrant.

    The documents are split with a ``SentenceSplitter`` (chunk size 2000,
    overlap 64), embedded with the ``BAAI/bge-small-en-v1.5`` Hugging Face
    model, and written to the ``ai_tutor_knowledge`` collection through a
    ``QdrantVectorStore``.

    The Qdrant API key is read from the ``Qdrant_key`` environment variable.

    Returns:
        The ``VectorStoreIndex`` built over the Qdrant collection.

    Raises:
        RuntimeError: if the ``Qdrant_key`` environment variable is unset or
            empty.
    """
    from qdrant_client import QdrantClient

    # The key must be looked up here: a local variable of another function
    # is not visible in this scope.
    qdrant_key = os.getenv('Qdrant_key')
    if not qdrant_key:
        # Fail before the expensive download/embedding work starts.
        raise RuntimeError(
            "Environment variable 'Qdrant_key' must be set to the Qdrant API key"
        )

    qdrant_client = QdrantClient(
        url="https://afc34f29-812e-40ea-b515-a8cc6ae9ed37.us-east4-0.gcp.cloud.qdrant.io:6333",
        api_key=qdrant_key,
    )

    embed_model = HuggingFaceEmbedding(
        model_name="BAAI/bge-small-en-v1.5",
    )
    docs = create_documents()

    vector_store = QdrantVectorStore(
        client=qdrant_client,
        collection_name="ai_tutor_knowledge",
    )

    index = VectorStoreIndex.from_documents(
        docs,
        storage_context=StorageContext.from_defaults(
            vector_store=vector_store
        ),
        embed_model=embed_model,
        transformations=[SentenceSplitter(chunk_size=2000, chunk_overlap=64)],
        show_progress=True,
    )
    return index