from langchain_community.vectorstores import Chroma from langchain_community.embeddings import SentenceTransformerEmbeddings from langchain_core.documents import Document from pypdf import PdfReader from langchain.chains import LLMChain from langchain.llms import HuggingFaceHub import os from huggingface_hub import InferenceClient from dotenv import load_dotenv import uuid API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN") hf_client = InferenceClient(token=API_TOKEN) # --- LLM and EMBEDDINGS SETUP --- def create_embeddings_load_data(): embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2") return embeddings def get_llm(): try: llm = HuggingFaceHub( repo_id="google/t5-small-lm-adapt", model_kwargs={"temperature": 0.1, "max_length": 200} ) return llm except Exception as e: print(f"Error loading HuggingFaceHub LLM: {e}") return None # --- PDF PROCESSING --- def get_pdf_text(pdf_doc): text = "" pdf_reader = PdfReader(pdf_doc) for page in pdf_reader.pages: text += page.extract_text() or "" return text def create_docs(user_pdf_list, unique_id): docs = [] for filename in user_pdf_list: chunks = get_pdf_text(filename) docs.append(Document( page_content=chunks, metadata={"name": filename.name, "unique_id": unique_id}, )) return docs # --- CHROMA DB (FREE LOCAL VECTOR STORE) FUNCTIONS --- VECTOR_STORES = {} def push_to_chroma(unique_id, embeddings, docs): global VECTOR_STORES vectorstore = Chroma.from_documents(docs, embeddings) VECTOR_STORES[unique_id] = vectorstore return vectorstore def pull_from_chroma(unique_id): global VECTOR_STORES return VECTOR_STORES.get(unique_id) def similar_docs(query, k, unique_id): vectorstore = pull_from_chroma(unique_id) if not vectorstore: raise ValueError("Vector store not initialized for this session.") similar_docs_with_score = vectorstore.similarity_search_with_score(query, k=int(k)) return similar_docs_with_score # --- SUMMARIZATION --- def get_summary(doc): try: text = doc.page_content if hasattr(doc, "page_content") else str(doc) result = hf_client.summarization(text) return result.summary_text if hasattr(result, "summary_text") else str(result) except Exception as e: return f"Summarization service error: {e}"