# cortexa-ai / main.py
# Commit by Jay-10020: "docker implementation with hugging face" (2b523d0)
"""
Main script for testing RAG system
"""
from pathlib import Path
from typing import List
from config import DOCUMENTS_DIR
from vectordb.document_processor import DocumentProcessor
from vectordb.json_store import get_json_store # Changed
from rag.retriever import get_retriever
from rag.generator import get_generator
def load_documents(file_paths: List[str]):
    """Chunk each file, embed the chunks into the JSON store, and export.

    Args:
        file_paths: Paths of the documents to ingest.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("LOADING DOCUMENTS")
    print(banner)

    processor = DocumentProcessor()
    vector_store = get_json_store()

    for path in file_paths:
        print(f"\nProcessing: {path}")
        chunks = processor.process_document(path)
        print(f"βœ“ Created {len(chunks)} chunks")

        # Chunk ids are "<file stem>_<index>" so they stay unique per file.
        stem = Path(path).stem
        texts, metadatas, ids = [], [], []
        for index, chunk in enumerate(chunks):
            texts.append(chunk.text)
            metadatas.append(chunk.metadata)
            ids.append(f"{stem}_{index}")
        vector_store.add_documents(texts, metadatas, ids)

    stats = vector_store.get_stats()
    print(f"\nβœ“ Total chunks in store: {stats['total_documents']}")
    print(f"βœ“ JSON file size: {stats['file_size_mb']:.2f} MB")

    # Write a chunks-only copy (no embeddings) alongside the store.
    vector_store.export_chunks_only()
def query_system(query: str):
    """Run one retrieval-augmented query and print the generated answer.

    Args:
        query: Natural-language question to answer from the store.
    """
    divider = "=" * 60
    rule = "-" * 60

    print("\n" + divider)
    print(f"QUERY: {query}")
    print(divider)

    retriever = get_retriever()
    generator = get_generator()

    print("\nπŸ” Retrieving relevant documents...")
    retrieved_docs = retriever.retrieve(query)
    print(f"βœ“ Found {len(retrieved_docs)} relevant chunks")

    # Show a short preview of every retrieved chunk, best match first.
    for rank, doc in enumerate(retrieved_docs, start=1):
        header = (
            f"\n[{rank}] {doc['source']} "
            f"(Chunk {doc['chunk_index']}, Similarity: {doc['similarity']:.3f})"
        )
        print(header)
        print(f"Preview: {doc['text'][:150]}...")

    print("\nπŸ’¬ Generating response...")
    context = retriever.format_context(retrieved_docs)
    answer = generator.generate_response(query, context)

    print("\n" + rule)
    print("ANSWER:")
    print(rule)
    print(answer)
    print(rule)
def interactive_mode():
    """Read questions from stdin in a loop until the user quits."""
    bar = "=" * 60
    print("\n" + bar)
    print("INTERACTIVE MODE")
    print(bar)
    print("Commands:")
    print(" - Type your question to query")
    print(" - Type 'stats' to see store statistics")
    print(" - Type 'quit' or 'exit' to stop")
    print(bar + "\n")

    vector_store = get_json_store()
    exit_words = {'quit', 'exit', 'q'}

    while True:
        query = input("\nπŸ’¬ Your question: ").strip()
        lowered = query.lower()

        if lowered in exit_words:
            print("Goodbye!")
            break

        if lowered == 'stats':
            # Dump every stat the store reports, one per line.
            stats = vector_store.get_stats()
            print("\nπŸ“Š Store Statistics:")
            for key, value in stats.items():
                print(f" {key}: {value}")
            continue

        # Ignore blank input; anything else is treated as a question.
        if query:
            query_system(query)
def main():
    """Entry point: discover documents, optionally ingest them, then chat.

    Scans DOCUMENTS_DIR for supported files, offers to load them into the
    JSON store, and drops into the interactive query loop.
    """
    print("\nπŸš€ Cortexa RAG System (JSON Storage)")
    print("="*60)

    supported = {'.pdf', '.txt', '.docx'}
    # Fix: compare suffixes case-insensitively (so "Report.PDF" is found)
    # and keep only regular files (glob("*") can also match directories).
    docs = [
        d for d in DOCUMENTS_DIR.glob("*")
        if d.is_file() and d.suffix.lower() in supported
    ]

    if not docs:
        print(f"\n⚠️ No documents found in {DOCUMENTS_DIR}")
        print("Please add PDF, TXT, or DOCX files to the documents folder.")
        return

    print(f"\nπŸ“„ Found {len(docs)} documents:")
    for doc in docs:
        print(f" - {doc.name}")

    load_choice = input("\nLoad documents into store? (y/n): ").strip().lower()
    if load_choice == 'y':
        load_documents([str(d) for d in docs])

    print("\nStarting interactive query mode...")
    interactive_mode()


if __name__ == "__main__":
    main()