from .llm import nemotron_llama
from .embeddings import get_embeddings
from .retriever import vector_db_retriever

import os
import pickle
import re as _re

# Project root: four directory levels up from this file.
BASE_DIR = os.path.dirname(
    os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
)


def get_path(folder, filename):
    """Resolve a data file against the two supported directory layouts.

    Tries the local layout (``<BASE_DIR>/volumes/<folder>/<filename>``)
    first, then the flat Hugging Face layout
    (``<BASE_DIR>/<folder>/<filename>``).

    Returns the first candidate that exists; if neither exists, returns the
    primary (``volumes/``) candidate so a later ``open()`` raises an error
    that names the first expected location.
    """
    # Try the standard 'volumes/' path (local structure).
    path1 = os.path.join(BASE_DIR, "volumes", folder, filename)
    if os.path.exists(path1):
        return path1
    # Try the root-level path (Hugging Face structure).
    path2 = os.path.join(BASE_DIR, folder, filename)
    if os.path.exists(path2):
        return path2
    # Default to path1 if neither exists, so the error surfaces on the
    # first expected path.
    return path1


# Load the corpus metadata once at import time.
# Bug fix: default `metadata` to {} so RAG() degrades to an empty context
# instead of raising NameError when the pickle file is missing.
metadata = {}
pkl_path = get_path("metadata", "new_pdfs_corpus_data.pkl")
if not os.path.exists(pkl_path):
    print(f"⚠️ ERROR: Metadata file not found at {pkl_path}")
else:
    # SECURITY NOTE: pickle.load executes arbitrary code from the file —
    # only ship trusted .pkl artifacts alongside this app.
    with open(pkl_path, "rb") as p:
        metadata = pickle.load(p)


def _clean_corpus(text: str) -> str:
    """Collapse PDF extraction artifacts: newlines between words become
    spaces, but preserve intentional paragraph breaks (two+ newlines)."""
    # Normalize all line endings to '\n' first.
    text = text.replace('\r\n', '\n').replace('\r', '\n')
    # Replace single newlines (mid-sentence line-wraps from PDF) with a
    # space; a '\n' not adjacent to another '\n' is a soft wrap.
    # NOTE(review): the original pattern was lost to source corruption —
    # this lookbehind/lookahead sub implements the documented intent;
    # confirm against the original file.
    text = _re.sub(r'(?<!\n)\n(?!\n)', ' ', text)
    return text


def RAG(query, chat_history, role=None):
    """Retrieve diverse context chunks for `query` and answer via the LLM.

    Builds a context string from the retrieved corpus chunks, capped at 10
    chunks with at most 2 chunks per document title (diversity filter),
    then forwards query + context + chat history to ``nemotron_llama``.

    NOTE(review): the ``def`` header and the retrieval call were lost to
    source corruption and are reconstructed from the visible call site
    (``nemotron_llama(..., role=role)``) and the legacy implementation —
    confirm the signature (especially the ``role`` default) and the
    retrieval fan-out against the original.
    """
    query_embeddings = get_embeddings([query])
    # Over-fetch candidates so the per-title diversity filter below can
    # still end up with 10 usable chunks.
    result = vector_db_retriever(query_embeddings, 30)
    indexes = result[0][0]
    ids = list(metadata.keys())

    context = ""
    title_counts = {}  # title -> chunks already taken from that document
    chunks_added = 0
    for idx in indexes:
        retrieved_results = metadata[ids[idx]]
        title = retrieved_results['title']
        # Diversity filter: skip once a title already contributed 2 chunks.
        if title_counts.get(title, 0) >= 2:
            continue
        clean_paragraph = _clean_corpus(retrieved_results['paragraphs'])
        context += f"Title: {title}\nPage Number: {retrieved_results['page']}\nCorpus: {clean_paragraph}\n\n"
        title_counts[title] = title_counts.get(title, 0) + 1
        chunks_added += 1
        # Stop once we have 10 highly diverse chunks.
        if chunks_added >= 10:
            break

    completion = nemotron_llama(query, context, chat_history, role=role)
    return completion


print("imported successfully")