File size: 2,551 Bytes
1f16ba7
 
 
 
2fff352
1f16ba7
 
2fff352
 
 
1f16ba7
2fff352
 
1f16ba7
 
 
 
 
 
 
 
 
 
2fff352
 
1f16ba7
 
 
 
 
 
 
2fff352
1f16ba7
 
 
 
2fff352
1f16ba7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2fff352
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_core.documents import Document
from pypdf import PdfReader
from langchain.chains import LLMChain
from langchain.llms import HuggingFaceHub
import os
from huggingface_hub import InferenceClient
from dotenv import load_dotenv
import uuid

# Load variables from a local .env file (if present) BEFORE reading the token.
# Fix: `load_dotenv` was imported but never called, so a token stored only in
# .env was silently missed and `API_TOKEN` came back as None.
load_dotenv()
API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")
hf_client = InferenceClient(token=API_TOKEN)
# --- LLM and EMBEDDINGS SETUP ---

def create_embeddings_load_data():
    """Build and return a local SentenceTransformer embedding model.

    Uses the lightweight ``all-MiniLM-L6-v2`` model, so embeddings are
    computed on-device without any API calls.
    """
    return SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")


def get_llm():
    """Return a HuggingFaceHub-backed LLM, or ``None`` on failure.

    Any setup error is printed rather than raised so callers can degrade
    gracefully when the hub is unreachable or misconfigured.
    """
    try:
        return HuggingFaceHub(
            repo_id="google/t5-small-lm-adapt",
            model_kwargs={"temperature": 0.1, "max_length": 200},
        )
    except Exception as e:
        print(f"Error loading HuggingFaceHub LLM: {e}")
        return None


# --- PDF PROCESSING ---
def get_pdf_text(pdf_doc):
    """Concatenate the extracted text of every page in *pdf_doc*.

    Pages with no extractable text contribute an empty string, so the
    result is always a ``str``.
    """
    reader = PdfReader(pdf_doc)
    return "".join(page.extract_text() or "" for page in reader.pages)


def create_docs(user_pdf_list, unique_id):
    """Wrap each uploaded PDF's full text in one ``Document`` apiece.

    Each document's metadata carries the file's ``name`` attribute and
    the caller's session ``unique_id`` so results can be filtered later.
    """
    return [
        Document(
            page_content=get_pdf_text(pdf_file),
            metadata={"name": pdf_file.name, "unique_id": unique_id},
        )
        for pdf_file in user_pdf_list
    ]


# --- CHROMA DB (FREE LOCAL VECTOR STORE) FUNCTIONS ---
# In-memory registry mapping a session's unique_id -> its Chroma vector store.
# NOTE(review): lives only for the process lifetime — restarting the app
# loses every store.
VECTOR_STORES = {}


def push_to_chroma(unique_id, embeddings, docs):
    """Embed *docs* into a fresh Chroma store and register it under *unique_id*.

    Returns the newly created vector store.
    """
    global VECTOR_STORES
    store = Chroma.from_documents(docs, embeddings)
    VECTOR_STORES[unique_id] = store
    return store


def pull_from_chroma(unique_id):
    """Look up the registered vector store for *unique_id* (``None`` if absent)."""
    global VECTOR_STORES
    store = VECTOR_STORES.get(unique_id)
    return store


def similar_docs(query, k, unique_id):
    """Return the top-*k* ``(document, score)`` pairs matching *query*.

    Raises:
        ValueError: if no vector store has been created for *unique_id*.
    """
    store = pull_from_chroma(unique_id)
    if not store:
        raise ValueError("Vector store not initialized for this session.")
    return store.similarity_search_with_score(query, k=int(k))


# --- SUMMARIZATION ---
def get_summary(doc):
    """Summarize a document's text via the Hugging Face inference API.

    Accepts a LangChain ``Document`` (uses its ``page_content``) or any
    object coercible to ``str``. Failures are returned as an error string
    instead of raised, so callers always receive displayable text.
    """
    try:
        if hasattr(doc, "page_content"):
            text = doc.page_content
        else:
            text = str(doc)
        result = hf_client.summarization(text)
        if hasattr(result, "summary_text"):
            return result.summary_text
        return str(result)
    except Exception as e:
        return f"Summarization service error: {e}"