import re

import faiss
import numpy as np
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from pdfminer.high_level import extract_text
from up_config import Config
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.docstore.in_memory import InMemoryDocstore
# from langchain_ollama import ChatOllama


def _invoke_llm(prompt_text: str, text: str) -> str:
    """Run *text* through the configured Gemini chat model with *prompt_text*.

    Shared plumbing for ``document_description`` and ``doc_summarizer``
    (previously duplicated in both).

    Args:
        prompt_text: A prompt template containing a ``{text}`` placeholder.
        text: The document text to substitute into the template.

    Returns:
        The first non-empty line of the model response, or a fallback
        message when the response contains no non-empty line.
    """
    llm = ChatGoogleGenerativeAI(model=Config.MODEL_NAME)
    # llm = ChatOllama(model="llama3.2:latest")
    summary_prompt = PromptTemplate.from_template(prompt_text)
    chain = summary_prompt | llm | StrOutputParser()
    response = chain.invoke({"text": text})
    # Models sometimes lead with blank lines; keep the first line of substance.
    return next(
        (line for line in response.split('\n') if line.strip()),
        "No description could be generated.",
    )


def document_description(text: str) -> str:
    """Return a one-paragraph, elaborate description of *text* via the LLM."""
    prompt_text = '''You are an expert AI assistant specialized in document understanding. Your task is to understand the text and create a general elaboration about the document content, providing a concise, one-paragraph elaborate description. Text: {text} Description:'''
    return _invoke_llm(prompt_text, text)


def doc_summarizer(text: str) -> str:
    """Return a one-paragraph summary of *text* via the LLM."""
    # NOTE: fixed typo "unessary" -> "unnecessary" in the instruction string.
    prompt_text = '''You are an expert AI assistant specialized in document understanding. Your task is to understand the text and create a general elaboration about the document content, providing a concise, one-paragraph summary. You can remove the unnecessary texts. Respond with only required answers. 
Text: {text} Description:'''
    return _invoke_llm(prompt_text, text)


def text_cleaning(text: str) -> str:
    """Lowercase and scrub extracted PDF text.

    Removes URLs, bracketed/parenthesised citations, simple tags, e-mail
    addresses, control characters, and hyphenated line breaks, then
    collapses whitespace to single spaces.
    """
    new_text = text.lower()
    # BUGFIX (ordering): de-hyphenation and control-char removal must happen
    # BEFORE the whitespace collapse below. In the original these ran after
    # `\s+` -> ' ', by which point no '\n', '\x0b' or '\x0c' remained, so the
    # replacements were dead code.
    new_text = new_text.replace('-\n', '')
    new_text = new_text.replace('\x0b', '').replace('\x0c', '')
    new_text = re.sub(r'https?://\S+|www\.\S+', '', new_text)
    new_text = re.sub(r"\[[a-z0-9,\s]+\]", '', new_text)
    new_text = re.sub(r"\([a-z0-9\s]+\)", '', new_text)
    new_text = re.sub(r"<[a-z0-9]+>", '', new_text)
    # BUGFIX (regex): original pattern `[a-z.]+@[a-z.]+.(...)` left the dot
    # unescaped and its `(...)` group swallowed three arbitrary characters
    # following every address. Use a properly anchored, escaped pattern.
    new_text = re.sub(r"[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,}", '', new_text)
    # Replace anything outside the allowed character set with a space.
    new_text = re.sub(r'[^a-z0-9.\s:\\{}_\[\]^,;\'\-+=!@$%&*()]', ' ', new_text)
    new_text = re.sub(r'\s+', ' ', new_text)
    return new_text


def text_processing(file_path: str) -> list[str]:
    """Extract text from the PDF at *file_path*, clean it, and chunk it.

    Returns a list of whitespace-normalised, stripped text chunks suitable
    for embedding.
    """
    text = extract_text(file_path)
    docs = text_cleaning(text)
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=Config.CHUNK_SIZE,
        chunk_overlap=Config.CHUNK_OVERLAP,
        add_start_index=True,
        separators=["\n\n", "\n", ".", " "],
    )
    raw_docs = splitter.split_text(docs)
    return [re.sub(r'\s+', ' ', doc).strip() for doc in raw_docs]


def ingest_into_vector_db_hnsw(file_path: str):
    """Ingest a PDF into an in-memory FAISS HNSW vector store.

    Extracts and chunks the document, generates an LLM summary of it,
    embeds the chunks, and builds a FAISS ``IndexHNSWFlat`` wrapped in a
    LangChain vectorstore.

    Args:
        file_path: Path to the PDF file to ingest.

    Returns:
        A ``(retriever, summary)`` tuple: a similarity retriever over the
        chunks and the one-paragraph document summary.
    """
    splited_docs = text_processing(file_path)
    description_text = " ".join(splited_docs)
    doc_desc = document_description(description_text)
    summ_docs = doc_summarizer(doc_desc)
    print("doc summ: ", summ_docs)

    embeddings = HuggingFaceEmbeddings(model_name=Config.EMBEDDING_MODEL)
    vectors = embeddings.embed_documents(splited_docs)

    d = len(vectors[0])     # embedding dimensionality
    M = 32                  # HNSW graph connectivity
    index = faiss.IndexHNSWFlat(d, M)
    index.hnsw.efConstruction = 200  # build-time beam width (recall/speed)
    index.hnsw.efSearch = 100        # query-time beam width
    # faiss requires float32 input.
    index.add(np.array(vectors, dtype="float32"))

    docs = [Document(page_content=doc) for doc in splited_docs]
    # LangChain docstore ids are conventionally strings; map vector position
    # -> string id (the original used raw ints, which only works incidentally).
    docstore = InMemoryDocstore({str(i): doc for i, doc in enumerate(docs)})
    index_to_docstore_id = {i: str(i) for i in range(len(docs))}

    vectorstore = FAISS(
        embedding_function=embeddings,
        index=index,
        docstore=docstore,
        index_to_docstore_id=index_to_docstore_id,
    )
    retriever = vectorstore.as_retriever(
        search_type="similarity",
        search_kwargs={
            'k': Config.RETRIEVER_K,
            # 'score_threshold': 0.4
        },
    )
    return retriever, summ_docs