File size: 2,883 Bytes
c7a6fe6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import os

# Environment Setup — must run before any library that initializes CUDA is imported.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

import argparse

import chromadb
from chromadb.utils import embedding_functions
from datasets import load_dataset
from tqdm import tqdm

# Tuning knobs.
BATCH_SIZE = 100            # chunks per Chroma upload: bounds memory, saves incrementally
MAX_ARTICLES = 500_000      # cap on streamed articles; 1M+ articles could yield 10M+ chunks
MIN_PARAGRAPH_WORDS = 20    # paragraphs shorter than this are dropped as noise
LOG_EVERY = 500             # print DB size every N articles


def _chunk_paragraphs(text):
    """Split *text* on blank lines, keeping paragraphs with more than MIN_PARAGRAPH_WORDS words."""
    return [p.strip() for p in text.split('\n\n') if len(p.split()) > MIN_PARAGRAPH_WORDS]


def build_index(collection, lang_code):
    """Stream the Wikipedia dump for *lang_code* and add paragraph chunks to *collection*.

    Uses HF datasets streaming so the full dump is never held in RAM, and uploads
    to Chroma in batches of BATCH_SIZE documents.
    """
    print(f"Database empty. Processing Wikipedia ({lang_code})...")

    # Use streaming to avoid loading the whole dataset into RAM.
    ds = load_dataset(
        "wikimedia/wikipedia", f"20231101.{lang_code}", split='train', streaming=True
    )

    batch_docs = []
    batch_ids = []
    # Single progress bar over articles only — a per-paragraph bar per article
    # would spam the console and add overhead.
    for i, item in tqdm(enumerate(ds.take(MAX_ARTICLES)), total=MAX_ARTICLES):
        for p_idx, para in enumerate(_chunk_paragraphs(item['text'])):
            batch_docs.append(para)
            # ID encodes article index + paragraph index, so re-runs are stable.
            batch_ids.append(f"art_{i}_p_{p_idx}")

            # Batch upload to Chroma: prevents memory overflow and allows
            # for incremental saving.
            if len(batch_docs) >= BATCH_SIZE:
                collection.add(documents=batch_docs, ids=batch_ids)
                batch_docs = []
                batch_ids = []

        if i % LOG_EVERY == 0:
            print(f"Processed {i} articles... Total chunks in DB: {collection.count()}")

    # Flush any remaining partial batch.
    if batch_docs:
        collection.add(documents=batch_docs, ids=batch_ids)

    print(f"Finished! Total documents in DB: {collection.count()}")


def main():
    """Build (if absent) a Chroma vector DB from Wikipedia, then run a demo query."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--lang", type=str, default="pt", help="language code")
    args = parser.parse_args()
    lang_code = args.lang

    db_path = f"/home/mshahidul/readctrl/data/vector_db/{lang_code}_v2"

    # Initialize client and embedding function.
    client = chromadb.PersistentClient(path=db_path)
    # Qwen3-Embedding-4B is heavy; ensure your GPU has ~10GB+ VRAM.
    ef = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name='Qwen/Qwen3-Embedding-4B',
        device="cuda",
    )
    collection = client.get_or_create_collection(
        name="wiki_collection", embedding_function=ef
    )

    # Only (re)build when the persisted collection is empty.
    if collection.count() == 0:
        build_index(collection, lang_code)
    else:
        print(f"Database already exists with {collection.count()} documents. Loading...")

    # Demo search.
    query = "Tell me about history"  # Adjust based on your language
    results = collection.query(
        query_texts=[query],
        n_results=3,
    )

    print(f"\nQuery: {query}")
    for i, doc in enumerate(results['documents'][0]):
        print(f"Result {i+1}: {doc[:200]}...")  # Print first 200 chars


if __name__ == "__main__":
    main()