# RAG ingestion pipeline: PDF -> cleaned text chunks -> FAISS HNSW retriever, plus an LLM-generated document summary.
| import re | |
| import faiss | |
| import numpy as np | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain_huggingface import HuggingFaceEmbeddings | |
| from langchain_community.vectorstores import FAISS | |
| from langchain_core.documents import Document | |
| from pdfminer.high_level import extract_text | |
| from up_config import Config | |
| from langchain_google_genai import ChatGoogleGenerativeAI | |
| from langchain.prompts import PromptTemplate | |
| from langchain_core.output_parsers import StrOutputParser | |
| from langchain_community.docstore.in_memory import InMemoryDocstore | |
| # from langchain_ollama import ChatOllama | |
def document_description(text: str) -> str:
    """Produce a one-paragraph description of *text* using the configured Gemini model.

    Args:
        text: Full (cleaned) document text to describe.

    Returns:
        The first non-blank line of the model's response, or a fallback
        message when the response is entirely blank.
    """
    model = ChatGoogleGenerativeAI(model=Config.MODEL_NAME)
    # llm = ChatOllama(model="llama3.2:latest")
    template = '''You are an expert AI assistant specialized in document understanding. Your task is to understand the text and create a general elaboration about the document content, providing a concise, one-paragraph elaborate description.
Text: {text}
Description:'''
    pipeline = PromptTemplate.from_template(template) | model | StrOutputParser()
    raw_output = pipeline.invoke({"text": text})
    # Return the first line that carries content; models sometimes lead with blanks.
    for line in raw_output.split('\n'):
        if line.strip():
            return line
    return "No description could be generated."
def doc_summarizer(text: str) -> str:
    """Condense *text* into a concise one-paragraph summary via the configured Gemini model.

    Args:
        text: Text to summarize (typically the output of document_description).

    Returns:
        The first non-blank line of the model's response, or a fallback
        message when the response is entirely blank.
    """
    llm = ChatGoogleGenerativeAI(model=Config.MODEL_NAME)
    # llm = ChatOllama(model="llama3.2:latest")
    # Fixed typo in the instruction ("unessary" -> "unnecessary") so the model
    # receives a well-formed directive.
    prompt_text = '''You are an expert AI assistant specialized in document understanding. Your task is to understand the text and create a general elaboration about the document content, providing a concise, one-paragraph summary. You can remove the unnecessary texts. Respond with only required answers.
Text: {text}
Description:'''
    summary_prompt = PromptTemplate.from_template(prompt_text)
    chain = summary_prompt | llm | StrOutputParser()
    response = chain.invoke({"text": text})
    # First line with content, else a deterministic fallback.
    return next((line for line in response.split('\n') if line.strip()), "No description could be generated.")
def text_cleaning(text: str) -> str:
    """Normalize raw PDF-extracted text for chunking.

    Lowercases, repairs line-break hyphenation, strips URLs, bracketed
    citations, simple parentheticals, angle-bracket tags, e-mail addresses,
    and any character outside a whitelist, then collapses all whitespace
    runs to single spaces.

    Args:
        text: Raw text as returned by pdfminer's extract_text.

    Returns:
        A single-line, lowercased, whitespace-normalized string.
    """
    new_text = text.lower()
    # Re-join words hyphenated across line breaks. This MUST run before the
    # whitespace collapse below, otherwise the '\n' is gone and the replace
    # is dead code (the original ordering had this bug).
    new_text = new_text.replace('-\n', '')
    # Drop vertical-tab/form-feed control characters pdfminer can emit
    # (also before the \s+ collapse, which would otherwise consume them).
    new_text = new_text.replace('\x0b', '').replace('\x0c', '')
    # Remove URLs.
    new_text = re.sub(r'https?://\S+|www\.\S+', '', new_text)
    # Remove bracketed citation markers such as [12] or [a, b].
    new_text = re.sub(r"\[[a-z0-9,\s]+\]", '', new_text)
    # Remove simple alphanumeric parentheticals such as (see figure 2).
    new_text = re.sub(r"\([a-z0-9\s]+\)", '', new_text)
    # Remove simple angle-bracket tags such as <b>.
    new_text = re.sub(r"<[a-z0-9]+>", '', new_text)
    # Remove e-mail addresses. The original pattern used an unescaped '.'
    # and a stray '(...)' group that swallowed three arbitrary characters
    # AFTER the address; this anchors on an escaped dot and a TLD instead.
    new_text = re.sub(r"[a-z0-9._%+\-]+@[a-z0-9.\-]+\.[a-z]{2,}", '', new_text)
    # Whitelist filter: anything outside this character set becomes a space.
    new_text = re.sub(r'[^a-z0-9.\s:\\{}_\[\]^,;\'\-+=!@$%&*()]', ' ', new_text)
    # Collapse every whitespace run (spaces, newlines, tabs) to one space.
    new_text = re.sub(r'\s+', ' ', new_text)
    return new_text
def text_processing(file_path: str) -> list[str]:
    """Extract a PDF's text, clean it, and split it into normalized chunks.

    Args:
        file_path: Path to the PDF file to ingest.

    Returns:
        List of whitespace-normalized, stripped text chunks sized per Config.
    """
    cleaned = text_cleaning(extract_text(file_path))
    chunker = RecursiveCharacterTextSplitter(
        chunk_size=Config.CHUNK_SIZE,
        chunk_overlap=Config.CHUNK_OVERLAP,
        add_start_index=True,
        separators=["\n\n", "\n", ".", " "],
    )
    # Normalize each chunk: collapse internal whitespace, trim the edges.
    return [re.sub(r'\s+', ' ', chunk).strip() for chunk in chunker.split_text(cleaned)]
def ingest_into_vector_db_hnsw(file_path: str):
    """Ingest a PDF into an in-memory FAISS HNSW index and build a retriever.

    Chunks the document, generates a description and summary via the LLM
    helpers, embeds the chunks, and wires them into a LangChain FAISS
    vector store backed by an HNSW index.

    Args:
        file_path: Path to the PDF file to ingest.

    Returns:
        Tuple of (similarity retriever, one-paragraph document summary).
    """
    chunks = text_processing(file_path)
    overview = document_description(" ".join(chunks))
    summary = doc_summarizer(overview)
    print("doc summ: ", summary)

    embedder = HuggingFaceEmbeddings(model_name=Config.EMBEDDING_MODEL)
    matrix = np.array(embedder.embed_documents(chunks), dtype="float32")

    # HNSW graph index: 32 neighbors per node; construction/search beam widths
    # tuned for recall over raw speed.
    hnsw_index = faiss.IndexHNSWFlat(matrix.shape[1], 32)
    hnsw_index.hnsw.efConstruction = 200
    hnsw_index.hnsw.efSearch = 100
    hnsw_index.add(matrix)

    # Mirror the chunks into a docstore; positional ids keep the FAISS row,
    # the docstore key, and the id mapping aligned.
    documents = [Document(page_content=chunk) for chunk in chunks]
    store = InMemoryDocstore(dict(enumerate(documents)))
    position_to_id = {position: position for position in range(len(documents))}

    vectorstore = FAISS(
        embedding_function=embedder,
        index=hnsw_index,
        docstore=store,
        index_to_docstore_id=position_to_id,
    )
    retriever = vectorstore.as_retriever(
        search_type="similarity",
        search_kwargs={
            'k': Config.RETRIEVER_K,
            # 'score_threshold': 0.4
        },
    )
    return retriever, summary