# Socrates / up_database.py
# NOTE(review): lines above the imports were Hugging Face web-page residue
# ("Shubham578's picture", "Update up_database.py", commit a8da121) — not
# valid Python; converted to this comment header so the module parses.
import re
import faiss
import numpy as np
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from pdfminer.high_level import extract_text
from up_config import Config
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.docstore.in_memory import InMemoryDocstore
# from langchain_ollama import ChatOllama
def document_description(text: str) -> str:
    """Produce a one-paragraph description of *text* using the configured Gemini model.

    Returns the first non-empty line of the model output, or a fallback
    message when the model returns only blank lines.
    """
    model = ChatGoogleGenerativeAI(model=Config.MODEL_NAME)
    template = PromptTemplate.from_template(
        '''You are an expert AI assistant specialized in document understanding. Your task is to understand the text and create a general elaboration about the document content, providing a concise, one-paragraph elaborate description.
Text: {text}
Description:'''
    )
    pipeline = template | model | StrOutputParser()
    output = pipeline.invoke({"text": text})
    # Keep the first line that contains anything beyond whitespace.
    for line in output.split('\n'):
        if line.strip():
            return line
    return "No description could be generated."
def doc_summarizer(text: str) -> str:
    """Condense *text* into a one-paragraph summary via the configured Gemini model.

    Returns the first non-empty line of the model output, or a fallback
    message when the model returns only blank lines.
    """
    model = ChatGoogleGenerativeAI(model=Config.MODEL_NAME)
    template = PromptTemplate.from_template(
        '''You are an expert AI assistant specialized in document understanding. Your task is to understand the text and create a general elaboration about the document content, providing a concise, one-paragraph summary. You can remove the unessary texts. Respond with only required answers.
Text: {text}
Description:'''
    )
    pipeline = template | model | StrOutputParser()
    output = pipeline.invoke({"text": text})
    # Keep the first line that contains anything beyond whitespace.
    for line in output.split('\n'):
        if line.strip():
            return line
    return "No description could be generated."
def text_cleaning(text: str) -> str:
    """Lowercase and scrub extracted PDF text for chunking.

    Removes URLs, square-bracket citations, short parentheticals, angle-bracket
    tags, and e-mail addresses; strips characters outside a whitelist; and
    collapses all whitespace runs to single spaces.

    Fixes over the previous revision:
    - The e-mail pattern used an unescaped ``.`` plus a wildcard ``(...)``
      group, so it also deleted up to three arbitrary characters after an
      address; it now requires a literal dot and an alphabetic TLD.
    - De-hyphenation of line-broken words (``-\\n``) ran *after* whitespace
      was collapsed to plain spaces, so it could never match; it now runs
      first. The ``\\x0b``/``\\x0c`` removals were dead for the same reason
      and are dropped — both are whitespace, so the ``\\s+`` collapse
      already handles them.
    """
    new_text = text.lower()
    # Rejoin words hyphenated across line breaks before any whitespace handling.
    new_text = new_text.replace('-\n', '')
    new_text = re.sub(r'https?://\S+|www\.\S+', '', new_text)          # URLs
    new_text = re.sub(r"\[[a-z0-9,\s]+\]", '', new_text)               # [1], [12, 13] citations
    new_text = re.sub(r"\([a-z0-9\s]+\)", '', new_text)                # simple parentheticals
    new_text = re.sub(r"<[a-z0-9]+>", '', new_text)                    # <tag>-style markup
    new_text = re.sub(r"[a-z.]+@[a-z.]+\.[a-z]{2,}", '', new_text)     # e-mail addresses
    # Whitelist: keep alphanumerics, whitespace, and common punctuation/symbols.
    new_text = re.sub(r'[^a-z0-9.\s:\\{}_\[\]^,;\'\-+=!@$%&*()]', ' ', new_text)
    new_text = re.sub(r'\s+', ' ', new_text)
    return new_text
def text_processing(file_path: str) -> list[str]:
    """Extract the text of the PDF at *file_path*, clean it, and split it
    into overlapping chunks with per-chunk whitespace normalized."""
    cleaned = text_cleaning(extract_text(file_path))
    chunker = RecursiveCharacterTextSplitter(
        chunk_size=Config.CHUNK_SIZE,
        chunk_overlap=Config.CHUNK_OVERLAP,
        add_start_index=True,
        separators=["\n\n", "\n", ".", " "],
    )
    # Collapse any residual whitespace inside each chunk and trim the edges.
    return [re.sub(r'\s+', ' ', chunk).strip() for chunk in chunker.split_text(cleaned)]
def ingest_into_vector_db_hnsw(file_path: str):
    """Index the chunks of *file_path* in an HNSW-backed FAISS store.

    Returns a ``(retriever, summary)`` tuple: a similarity retriever over the
    document chunks and a one-paragraph summary of the whole document.
    """
    chunks = text_processing(file_path)
    # Describe the full document, then compress that description to a summary.
    summary = doc_summarizer(document_description(" ".join(chunks)))
    print("doc summ: ", summary)

    embedder = HuggingFaceEmbeddings(model_name=Config.EMBEDDING_MODEL)
    chunk_vectors = embedder.embed_documents(chunks)

    # HNSW graph index: 32 neighbours per node; efConstruction/efSearch trade
    # build/query time for recall.
    hnsw_index = faiss.IndexHNSWFlat(len(chunk_vectors[0]), 32)
    hnsw_index.hnsw.efConstruction = 200
    hnsw_index.hnsw.efSearch = 100
    hnsw_index.add(np.array(chunk_vectors, dtype="float32"))

    documents = [Document(page_content=chunk) for chunk in chunks]
    store = FAISS(
        embedding_function=embedder,
        index=hnsw_index,
        docstore=InMemoryDocstore(dict(enumerate(documents))),
        # FAISS row i maps straight to docstore key i.
        index_to_docstore_id={pos: pos for pos in range(len(documents))},
    )
    retriever = store.as_retriever(
        search_type="similarity",
        search_kwargs={
            'k': Config.RETRIEVER_K,
            # 'score_threshold': 0.4
        },
    )
    return retriever, summary