import re
import faiss
import numpy as np
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from pdfminer.high_level import extract_text
from up_config import Config
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.docstore.in_memory import InMemoryDocstore
# from langchain_ollama import ChatOllama
def document_description(text: str) -> str:
    """Produce a one-paragraph description of *text* via the configured Gemini model.

    Returns the first non-blank line of the model's reply, or a fallback
    message when the reply contains no usable line.
    """
    model = ChatGoogleGenerativeAI(model=Config.MODEL_NAME)
    # llm = ChatOllama(model="llama3.2:latest")
    template = '''You are an expert AI assistant specialized in document understanding. Your task is to understand the text and create a general elaboration about the document content, providing a concise, one-paragraph elaborate description.
Text: {text}
Description:'''
    pipeline = PromptTemplate.from_template(template) | model | StrOutputParser()
    reply = pipeline.invoke({"text": text})
    # Hand back the first line that isn't pure whitespace.
    for line in reply.split('\n'):
        if line.strip():
            return line
    return "No description could be generated."
def doc_summarizer(text: str) -> str:
    """Condense *text* to a one-paragraph summary via the configured Gemini model.

    Returns the first non-blank line of the model's reply, or a fallback
    message when the reply contains no usable line.
    """
    model = ChatGoogleGenerativeAI(model=Config.MODEL_NAME)
    # llm = ChatOllama(model="llama3.2:latest")
    template = '''You are an expert AI assistant specialized in document understanding. Your task is to understand the text and create a general elaboration about the document content, providing a concise, one-paragraph summary. You can remove the unessary texts. Respond with only required answers.
Text: {text}
Description:'''
    pipeline = PromptTemplate.from_template(template) | model | StrOutputParser()
    reply = pipeline.invoke({"text": text})
    # Hand back the first line that isn't pure whitespace.
    for line in reply.split('\n'):
        if line.strip():
            return line
    return "No description could be generated."
def text_cleaning(text: str) -> str:
    """Normalize raw PDF text: lowercase, strip URLs/citations/tags/emails,
    de-hyphenate line-wrapped words, and collapse all whitespace to single spaces.

    Args:
        text: Raw text as extracted from a PDF.

    Returns:
        The cleaned, single-line-whitespace text.
    """
    new_text = text.lower()
    # De-hyphenate words broken across lines BEFORE whitespace collapsing;
    # the original ran this after `\s+ -> ' '`, so `-\n` could never match.
    new_text = new_text.replace('-\n', '')
    new_text = re.sub(r'https?://\S+|www\.\S+', '', new_text)       # URLs
    new_text = re.sub(r"\[[a-z0-9,\s]+\]", '', new_text)            # [12]-style citations
    new_text = re.sub(r"\([a-z0-9\s]+\)", '', new_text)             # simple parentheticals
    new_text = re.sub(r"<[a-z0-9]+>", '', new_text)                 # bare tags like <b>
    # Email addresses. The old pattern `[a-z.]+@[a-z.]+.(...)` left the TLD dot
    # unescaped and consumed three arbitrary characters after the address.
    new_text = re.sub(r"[a-z0-9._%+\-]+@[a-z0-9.\-]+\.[a-z]{2,}", '', new_text)
    # Replace anything outside the allowed character set with a space.
    new_text = re.sub(r'[^a-z0-9.\s:\\{}_\[\]^,;\'\-+=!@$%&*()]', ' ', new_text)
    # Collapse runs of whitespace (this also consumes \x0b/\x0c, which the
    # original tried to remove afterwards — dead code, dropped here).
    new_text = re.sub(r'\s+', ' ', new_text)
    return new_text
def text_processing(file_path: str) -> list[str]:
    """Extract text from a PDF, clean it, and split it into normalized chunks.

    Args:
        file_path: Path to the PDF file.

    Returns:
        List of whitespace-normalized, stripped text chunks.
    """
    cleaned = text_cleaning(extract_text(file_path))
    chunker = RecursiveCharacterTextSplitter(
        chunk_size=Config.CHUNK_SIZE,
        chunk_overlap=Config.CHUNK_OVERLAP,
        add_start_index=True,
        separators=["\n\n", "\n", ".", " "],
    )
    chunks: list[str] = []
    for piece in chunker.split_text(cleaned):
        # Re-normalize whitespace per chunk in case the splitter re-introduced any.
        chunks.append(re.sub(r'\s+', ' ', piece).strip())
    return chunks
def ingest_into_vector_db_hnsw(file_path: str, *, hnsw_m: int = 32,
                               ef_construction: int = 200, ef_search: int = 100):
    """Build an HNSW-backed FAISS similarity retriever over a PDF's chunks.

    Args:
        file_path: Path to the PDF file to ingest.
        hnsw_m: Number of bi-directional links per HNSW node (graph degree).
        ef_construction: Candidate-list breadth while building the graph.
        ef_search: Candidate-list breadth at query time.

    Returns:
        Tuple of (retriever, one-paragraph document summary string).

    Raises:
        ValueError: If no text chunks could be extracted from the PDF.
    """
    splited_docs = text_processing(file_path)
    if not splited_docs:
        # The original crashed later with IndexError on vectors[0]; fail clearly instead.
        raise ValueError(f"No text could be extracted from {file_path!r}")

    # Summarize the whole document once so callers can show a short abstract.
    doc_desc = document_description(" ".join(splited_docs))
    summ_docs = doc_summarizer(doc_desc)
    print("doc summ: ", summ_docs)

    embeddings = HuggingFaceEmbeddings(model_name=Config.EMBEDDING_MODEL)
    vectors = embeddings.embed_documents(splited_docs)

    # HNSW index: dimensionality from the first embedding; tuning knobs are
    # now parameters (defaults match the previous hard-coded values).
    dim = len(vectors[0])
    index = faiss.IndexHNSWFlat(dim, hnsw_m)
    index.hnsw.efConstruction = ef_construction
    index.hnsw.efSearch = ef_search
    index.add(np.array(vectors, dtype="float32"))

    docs = [Document(page_content=doc) for doc in splited_docs]
    docstore = InMemoryDocstore({i: doc for i, doc in enumerate(docs)})
    vectorstore = FAISS(
        embedding_function=embeddings,
        index=index,
        docstore=docstore,
        # FAISS row i maps straight to docstore key i.
        index_to_docstore_id={i: i for i in range(len(docs))},
    )
    retriever = vectorstore.as_retriever(
        search_type="similarity",
        search_kwargs={
            'k': Config.RETRIEVER_K,
            # 'score_threshold': 0.4
        },
    )
    return retriever, summ_docs