from .llm import nemotron_llama
from .embeddings import get_embeddings
from .retriever import vector_db_retriever
import pickle
import os
# Repository root: four directory levels above this module.
BASE_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))


def get_path(folder, filename):
    """Resolve a data file path across the two supported layouts.

    Tries the local ``volumes/<folder>/<filename>`` layout first, then the
    flat root-level layout used on Hugging Face. If neither exists, the
    ``volumes/`` path is returned anyway so any subsequent open() fails on
    the primary expected location.
    """
    candidates = (
        os.path.join(BASE_DIR, "volumes", folder, filename),
        os.path.join(BASE_DIR, folder, filename),
    )
    for candidate in candidates:
        if os.path.exists(candidate):
            return candidate
    # Neither layout has the file: report the primary path.
    return candidates[0]
# Load the chunk-metadata mapping produced at index time. Initialize to an
# empty dict first so the module still imports when the pickle is missing
# (otherwise `metadata` would be undefined and RAG would raise NameError).
metadata = {}
pkl_path = get_path("metadata", "new_pdfs_corpus_data.pkl")
if not os.path.exists(pkl_path):
    print(f"⚠️ ERROR: Metadata file not found at {pkl_path}")
else:
    # NOTE(review): pickle.load on a bundled artifact is fine for trusted
    # data, but never point this at untrusted input.
    with open(pkl_path, "rb") as p:
        metadata = pickle.load(p)
import re as _re
def _clean_corpus(text: str) -> str:
"""Collapse PDF extraction artifacts: newlines between words become spaces,
but preserve intentional paragraph breaks (two+ newlines)."""
# Preserve double newlines (paragraph breaks) as a placeholder
text = text.replace('\r\n', '\n').replace('\r', '\n')
# Replace single newlines (mid-sentence line-wraps from PDF) with a space
text = _re.sub(r'(?<!\n)\n(?!\n)', ' ', text)
# Collapse multiple spaces into one
text = _re.sub(r' {2,}', ' ', text)
return text.strip()
def RAG(query, chat_history, role="General"):
    """Answer *query* via retrieval-augmented generation.

    Embeds the query, retrieves candidate chunks from the FAISS index,
    builds a diversity-filtered context (at most 2 chunks per source title,
    10 chunks total) and forwards everything to the LLM.

    Args:
        query: the user's question.
        chat_history: prior conversation turns, passed through to the LLM.
        role: persona hint forwarded to ``nemotron_llama``.

    Returns:
        The completion returned by ``nemotron_llama``.
    """
    query_embeddings = get_embeddings([query])
    # Over-fetch (30 > 10) so the diversity filter below can skip chunks
    # without starving the final context.
    result = vector_db_retriever(query_embeddings, 30)
    indexes = result[0][0]

    context = ""
    title_counts = {}
    chunks_added = 0
    for idx in indexes:
        # FAISS pads results with -1 when the index holds fewer than
        # top_k vectors.
        if idx == -1:
            continue
        retrieved_results = metadata[idx]
        title = retrieved_results['title']
        # Enforce diversity: at most 2 chunks from any single PDF.
        if title_counts.get(title, 0) >= 2:
            continue
        clean_paragraph = _clean_corpus(retrieved_results['paragraphs'])
        context += f"Title: {title}\nPage Number: {retrieved_results['page']}\nCorpus: {clean_paragraph}\n\n"
        title_counts[title] = title_counts.get(title, 0) + 1
        chunks_added += 1
        # Stop once we have 10 sufficiently diverse chunks.
        if chunks_added >= 10:
            break

    return nemotron_llama(query, context, chat_history, role=role)
# Import-time confirmation for the Spaces startup log (typo fixed).
print("imported successfully")