# RAG ingestion pipeline: PDF -> cleaned text chunks -> FAISS HNSW retriever, plus an LLM-generated document summary.
| import re | |
| import faiss | |
| import numpy as np | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain_huggingface import HuggingFaceEmbeddings | |
| from langchain_community.vectorstores import FAISS | |
| from langchain_core.documents import Document | |
| from pdfminer.high_level import extract_text | |
| from up_config import Config | |
| from langchain_google_genai import ChatGoogleGenerativeAI | |
| from langchain.prompts import PromptTemplate | |
| from langchain_core.output_parsers import StrOutputParser | |
| from langchain_community.docstore.in_memory import InMemoryDocstore | |
| # from langchain_ollama import ChatOllama | |
def document_description(text: str) -> str:
    """Produce a one-paragraph description of *text* using the configured Gemini model.

    Args:
        text: Full (cleaned) document text to describe.

    Returns:
        The first non-blank line of the model's response, or a fallback
        message when the response is entirely blank.
    """
    model = ChatGoogleGenerativeAI(model=Config.MODEL_NAME)
    # llm = ChatOllama(model="llama3.2:latest")
    template = '''You are an expert AI assistant specialized in document understanding. Your task is to understand the text and create a general elaboration about the document content, providing a concise, one-paragraph elaborate description.
Text: {text}
Description:'''
    pipeline = PromptTemplate.from_template(template) | model | StrOutputParser()
    raw_output = pipeline.invoke({"text": text})
    # Return the first line that carries content; models sometimes lead with blanks.
    for line in raw_output.split('\n'):
        if line.strip():
            return line
    return "No description could be generated."
def doc_summarizer(text: str) -> str:
    """Condense *text* into a concise one-paragraph summary via the configured Gemini model.

    Args:
        text: Text to summarize (typically the output of document_description).

    Returns:
        The first non-blank line of the model's response, or a fallback
        message when the response is entirely blank.
    """
    llm = ChatGoogleGenerativeAI(model=Config.MODEL_NAME)
    # llm = ChatOllama(model="llama3.2:latest")
    # Fixed typo in the instruction ("unessary" -> "unnecessary") so the model
    # receives a well-formed directive.
    prompt_text = '''You are an expert AI assistant specialized in document understanding. Your task is to understand the text and create a general elaboration about the document content, providing a concise, one-paragraph summary. You can remove the unnecessary texts. Respond with only required answers.
Text: {text}
Description:'''
    summary_prompt = PromptTemplate.from_template(prompt_text)
    chain = summary_prompt | llm | StrOutputParser()
    response = chain.invoke({"text": text})
    # First line with content, else a deterministic fallback.
    return next((line for line in response.split('\n') if line.strip()), "No description could be generated.")
def text_cleaning(text: str) -> str:
    """Normalize raw PDF-extracted text for chunking.

    Lowercases, repairs line-break hyphenation, strips URLs, bracketed
    citations, simple parentheticals, angle-bracket tags, e-mail addresses,
    and any character outside a whitelist, then collapses all whitespace
    runs to single spaces.

    Args:
        text: Raw text as returned by pdfminer's extract_text.

    Returns:
        A single-line, lowercased, whitespace-normalized string.
    """
    new_text = text.lower()
    # Re-join words hyphenated across line breaks. This MUST run before the
    # whitespace collapse below, otherwise the '\n' is gone and the replace
    # is dead code (the original ordering had this bug).
    new_text = new_text.replace('-\n', '')
    # Drop vertical-tab/form-feed control characters pdfminer can emit
    # (also before the \s+ collapse, which would otherwise consume them).
    new_text = new_text.replace('\x0b', '').replace('\x0c', '')
    # Remove URLs.
    new_text = re.sub(r'https?://\S+|www\.\S+', '', new_text)
    # Remove bracketed citation markers such as [12] or [a, b].
    new_text = re.sub(r"\[[a-z0-9,\s]+\]", '', new_text)
    # Remove simple alphanumeric parentheticals such as (see figure 2).
    new_text = re.sub(r"\([a-z0-9\s]+\)", '', new_text)
    # Remove simple angle-bracket tags such as <b>.
    new_text = re.sub(r"<[a-z0-9]+>", '', new_text)
    # Remove e-mail addresses. The original pattern used an unescaped '.'
    # and a stray '(...)' group that swallowed three arbitrary characters
    # AFTER the address; this anchors on an escaped dot and a TLD instead.
    new_text = re.sub(r"[a-z0-9._%+\-]+@[a-z0-9.\-]+\.[a-z]{2,}", '', new_text)
    # Whitelist filter: anything outside this character set becomes a space.
    new_text = re.sub(r'[^a-z0-9.\s:\\{}_\[\]^,;\'\-+=!@$%&*()]', ' ', new_text)
    # Collapse every whitespace run (spaces, newlines, tabs) to one space.
    new_text = re.sub(r'\s+', ' ', new_text)
    return new_text
def text_processing(file_path: str) -> list[str]:
    """Extract a PDF's text, clean it, and split it into normalized chunks.

    Args:
        file_path: Path to the PDF file to ingest.

    Returns:
        List of whitespace-normalized, stripped text chunks sized per Config.
    """
    cleaned = text_cleaning(extract_text(file_path))
    chunker = RecursiveCharacterTextSplitter(
        chunk_size=Config.CHUNK_SIZE,
        chunk_overlap=Config.CHUNK_OVERLAP,
        add_start_index=True,
        separators=["\n\n", "\n", ".", " "],
    )
    # Normalize each chunk: collapse internal whitespace, trim the edges.
    return [re.sub(r'\s+', ' ', chunk).strip() for chunk in chunker.split_text(cleaned)]
def ingest_into_vector_db_hnsw(file_path: str):
    """Ingest a PDF into an in-memory FAISS HNSW index and build a retriever.

    Chunks the document, generates a description and summary via the LLM
    helpers, embeds the chunks, and wires them into a LangChain FAISS
    vector store backed by an HNSW index.

    Args:
        file_path: Path to the PDF file to ingest.

    Returns:
        Tuple of (similarity retriever, one-paragraph document summary).
    """
    chunks = text_processing(file_path)
    overview = document_description(" ".join(chunks))
    summary = doc_summarizer(overview)
    print("doc summ: ", summary)

    embedder = HuggingFaceEmbeddings(model_name=Config.EMBEDDING_MODEL)
    matrix = np.array(embedder.embed_documents(chunks), dtype="float32")

    # HNSW graph index: 32 neighbors per node; construction/search beam widths
    # tuned for recall over raw speed.
    hnsw_index = faiss.IndexHNSWFlat(matrix.shape[1], 32)
    hnsw_index.hnsw.efConstruction = 200
    hnsw_index.hnsw.efSearch = 100
    hnsw_index.add(matrix)

    # Mirror the chunks into a docstore; positional ids keep the FAISS row,
    # the docstore key, and the id mapping aligned.
    documents = [Document(page_content=chunk) for chunk in chunks]
    store = InMemoryDocstore(dict(enumerate(documents)))
    position_to_id = {position: position for position in range(len(documents))}

    vectorstore = FAISS(
        embedding_function=embedder,
        index=hnsw_index,
        docstore=store,
        index_to_docstore_id=position_to_id,
    )
    retriever = vectorstore.as_retriever(
        search_type="similarity",
        search_kwargs={
            'k': Config.RETRIEVER_K,
            # 'score_threshold': 0.4
        },
    )
    return retriever, summary