Spaces:
Sleeping
Sleeping
| import os | |
| import sys | |
| from pathlib import Path | |
| # Add src directory to path | |
| sys.path.append(os.path.join(os.path.dirname(__file__), 'src')) | |
| from qdrant_setup import QdrantSetup | |
| from document_ingestor import DocumentIngestor | |
| from load_huggingface_dataset import HuggingFaceDatasetLoader | |
| from embedding_generator import get_embedding_function | |
def setup_and_run():
    """
    Prepare the Hindi RAG system's data layer and report progress on stdout.

    The work is done in order:

    1. Build a ``QdrantSetup`` and ask it to create its collection.
    2. Ingest whatever HuggingFace datasets the loader is configured for.
    3. Ingest local documents from ``./data`` (resolved against the current
       working directory) when that path exists.
    4. Report the LLM provider named by the ``LLM_PROVIDER`` environment
       variable (default ``"huggingface"``). No RAG object is built here;
       per the printed message, the frontend does that on first request.

    Returns nothing. Exceptions raised by the helper classes are not caught.
    """
    print("Setting up Hindi RAG System...")

    # Step 1: vector store and collection.
    print("\n1. Setting up Qdrant...")
    store = QdrantSetup()
    store.create_collection()

    # Step 2: HuggingFace datasets. The client, collection name and embedding
    # function obtained here are reused for the local ingest in step 3.
    print("\n2. Loading datasets from HuggingFace...")
    client = store.get_client()
    collection = store.get_collection_name()
    embed = get_embedding_function()

    hub_loader = HuggingFaceDatasetLoader(client, collection)
    hub_count = hub_loader.load_and_ingest_all_configured(embed)

    if hub_count > 0:
        print(f" ✓ Loaded {hub_count} new documents from HuggingFace datasets")
    elif os.getenv("HF_DATASETS", ""):
        # Datasets are configured but nothing new was loaded.
        print(" ⊘ HuggingFace datasets already ingested (skipped)")
    else:
        print(" No HuggingFace datasets configured (set HF_DATASETS env variable)")

    # Step 3: local documents.
    print("\n3. Loading and ingesting local documents...")
    local_ingestor = DocumentIngestor(client, collection)
    local_dir = "./data"

    if not os.path.exists(local_dir):
        print(f" Warning: Data directory '{local_dir}' does not exist")
    else:
        local_count = local_ingestor.load_and_ingest(local_dir, embed)
        print(f" Loaded {local_count} local documents into Qdrant")

    # Step 4: only report the configured provider; nothing is instantiated.
    print("\n4. Initializing RAG system...")
    provider = os.getenv("LLM_PROVIDER", "huggingface")
    print(f" Using LLM provider: {provider}")
    print(" (RAG system will be initialized by frontend on first request)")

    print("\n5. Hindi RAG System setup complete!")
    # NOTE(review): this function does not launch Streamlit itself; presumably
    # the process that invokes this script starts it afterwards - confirm.
    print(" Starting Streamlit frontend...")
# Run the setup only when this file is executed as a script, not on import.
if __name__ == "__main__":
    setup_and_run()