File size: 3,794 Bytes
a74720f
 
 
 
 
 
 
 
535ee95
 
 
 
 
 
 
 
 
 
 
a74720f
535ee95
 
 
 
 
 
 
a74720f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b82f276
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
from .llm import nemotron_llama
from .embeddings import get_embeddings
from .retriever import vector_db_retriever
import pickle

import os

# Project root: four directory levels up from this file.
BASE_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
def get_path(folder, filename):
    """Resolve *filename* under *folder*, probing both deployment layouts.

    Checks ``BASE_DIR/volumes/<folder>/<filename>`` (local layout) first,
    then ``BASE_DIR/<folder>/<filename>`` (Hugging Face layout). When
    neither exists, the 'volumes' path is returned anyway so that any
    downstream open() error points at the primary expected location.
    """
    primary = os.path.join(BASE_DIR, "volumes", folder, filename)
    fallback = os.path.join(BASE_DIR, folder, filename)
    for candidate in (primary, fallback):
        if os.path.exists(candidate):
            return candidate
    return primary

pkl_path = get_path("metadata", "new_pdfs_corpus_data.pkl")

# Bind a fallback so a missing pickle degrades to an empty corpus (RAG()
# then raises a clear KeyError on lookup) instead of a NameError at call time.
metadata = {}

if not os.path.exists(pkl_path):
    print(f"⚠️ ERROR: Metadata file not found at {pkl_path}")
else:
    # NOTE(review): pickle.load is only safe because this file ships with the
    # app — never point pkl_path at untrusted input.
    with open(pkl_path, "rb") as p:
        metadata = pickle.load(p)

# ids = list(metadata.keys())


# def RAG(query, chat_history):
#     query_embeddings = get_embeddings([query])
#     result = vector_db_retriever(query_embeddings, 10)
#     indexes = result[0][0]
#     context = ""
#     for idx in indexes:
#         hash_id = ids[idx]
#         retrieved_results = metadata[hash_id]
#         context+="Title:"+retrieved_results['title']+"\n"+"Date:"+retrieved_results['date']+"\n"+"Page Number:"+str(retrieved_results['page_no'])+"\n"+"Corpus:"+retrieved_results['text']+"\n\n"
#     completion = nemotron_llama(query, context, chat_history)
#     # for chunk in completion:
#     #     if chunk.choices[0].delta.content is not None:
#     #         print(chunk.choices[0].delta.content, end = '')
#     return completion
# RAG("explain the seventh amendment act", chat_history=[])


import re as _re

def _clean_corpus(text: str) -> str:
    """Collapse PDF extraction artifacts: newlines between words become spaces,

    but preserve intentional paragraph breaks (two+ newlines)."""
    # Preserve double newlines (paragraph breaks) as a placeholder
    text = text.replace('\r\n', '\n').replace('\r', '\n')
    # Replace single newlines (mid-sentence line-wraps from PDF) with a space
    text = _re.sub(r'(?<!\n)\n(?!\n)', ' ', text)
    # Collapse multiple spaces into one
    text = _re.sub(r' {2,}', ' ', text)
    return text.strip()

def RAG(query, chat_history, role="General"):
    """Retrieve diverse context for *query* and generate an LLM completion.

    Embeds the query, over-fetches 30 candidate chunks from the vector
    store, keeps at most 2 chunks per source title (10 chunks total) to
    diversify the context, then forwards query + context + history to
    the LLM with the given *role*.
    """
    query_vec = get_embeddings([query])
    # Fetch 30 so the per-title diversity filter still leaves enough chunks.
    search_result = vector_db_retriever(query_vec, 30)

    context_parts = []
    per_title = {}
    kept = 0

    for idx in search_result[0][0]:
        # FAISS pads with -1 when the index holds fewer than top_k vectors.
        if idx == -1:
            continue

        entry = metadata[idx]
        title = entry['title']

        # Diversity cap: no more than 2 chunks from the same PDF.
        if per_title.get(title, 0) >= 2:
            continue

        paragraph = _clean_corpus(entry['paragraphs'])
        context_parts.append(
            f"Title: {title}\nPage Number: {entry['page']}\nCorpus: {paragraph}\n\n"
        )
        per_title[title] = per_title.get(title, 0) + 1
        kept += 1

        # Stop once 10 diverse chunks have been collected.
        if kept >= 10:
            break

    completion = nemotron_llama(query, "".join(context_parts), chat_history, role=role)
    return completion

# Startup banner confirming the module loaded (typo "sucessfully" fixed).
print("imported successfully")