File size: 4,298 Bytes
b10fa48
b4d1996
b10fa48
a8da121
b10fa48
 
 
 
 
 
a8da121
b10fa48
 
a8da121
b10fa48
 
63681ee
c805995
b4d1996
 
 
c805995
b10fa48
c805995
b10fa48
 
b4d1996
 
 
63681ee
b4d1996
 
 
 
 
 
 
 
 
c805995
b10fa48
b4d1996
b10fa48
b4d1996
 
 
 
 
b10fa48
b4d1996
 
b10fa48
 
b4d1996
b10fa48
 
 
b4d1996
b10fa48
 
 
 
 
 
b4d1996
b10fa48
 
 
 
 
 
63681ee
b10fa48
 
b4d1996
c805995
b10fa48
b4d1996
 
b10fa48
 
 
 
 
 
 
c805995
b10fa48
 
 
 
 
 
 
 
 
 
 
 
 
b4d1996
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import re
import faiss
import numpy as np
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from pdfminer.high_level import extract_text
from up_config import Config
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.docstore.in_memory import InMemoryDocstore
# from langchain_ollama import ChatOllama

def document_description(text: str) -> str:
    """Generate a one-paragraph elaborate description of the given text.

    Sends the text to the configured Gemini chat model with a
    document-understanding prompt and returns the first non-blank line of
    the reply, or a fallback message when the reply is blank.
    """
    llm = ChatGoogleGenerativeAI(model=Config.MODEL_NAME)
    # llm = ChatOllama(model="llama3.2:latest")

    prompt_text = '''You are an expert AI assistant specialized in document understanding. Your task is to understand the text and create a general elaboration about the document content, providing a concise, one-paragraph elaborate description.
Text: {text}
Description:'''
    description_chain = (
        PromptTemplate.from_template(prompt_text) | llm | StrOutputParser()
    )
    reply = description_chain.invoke({"text": text})
    # First non-blank line of the model reply, else the fallback message.
    for line in reply.split('\n'):
        if line.strip():
            return line
    return "No description could be generated."

def doc_summarizer(text: str) -> str:
    """Summarize document text into a concise, single-paragraph summary.

    Sends the text to the configured Gemini chat model and returns the
    first non-blank line of the reply, or a fallback message when the
    reply contains only whitespace.
    """
    llm = ChatGoogleGenerativeAI(model=Config.MODEL_NAME)
    # llm = ChatOllama(model="llama3.2:latest")

    # Fixed the "unessary" typo ("unnecessary") so the instruction reads
    # correctly to the model.
    prompt_text = '''You are an expert AI assistant specialized in document understanding. Your task is to understand the text and create a general elaboration about the document content, providing a concise, one-paragraph summary. You can remove the unnecessary texts. Respond with only required answers.
Text: {text}
Description:'''
    summary_prompt = PromptTemplate.from_template(prompt_text)
    chain = summary_prompt | llm | StrOutputParser()
    response = chain.invoke({"text": text})
    return next((line for line in response.split('\n') if line.strip()), "No description could be generated.")


def text_cleaning(text: str) -> str:
    """Lowercase and clean extracted PDF text for chunking/embedding.

    Removes URLs, bracketed/parenthesized citations, simple tags and
    email addresses, replaces disallowed characters with spaces, and
    collapses all whitespace runs to single spaces.
    """
    new_text = text.lower()
    # De-hyphenate words split across line breaks. This must run BEFORE the
    # whitespace collapse below — in the original the '-\n' replacement ran
    # after '\s+' -> ' ', so the pattern could never match (dead code).
    new_text = new_text.replace('-\n', '')
    new_text = re.sub(r'https?://\S+|www\.\S+', '', new_text)           # URLs
    new_text = re.sub(r"\[[a-z0-9,\s]+\]", '', new_text)                # [1, 2]-style citations
    new_text = re.sub(r"\([a-z0-9\s]+\)", '', new_text)                 # (simple parentheticals)
    new_text = re.sub(r"<[a-z0-9]+>", '', new_text)                     # <tags>
    # Email addresses. Original pattern "[a-z.]+@[a-z.]+.(...)" had an
    # unescaped dot, swallowed 3 arbitrary trailing chars via "(...)",
    # and rejected digits in the local part.
    new_text = re.sub(r"[a-z0-9.]+@[a-z.]+\.[a-z]{2,}", '', new_text)
    # Replace any character outside the allowed set with a space.
    new_text = re.sub(r'[^a-z0-9.\s:\\{}_\[\]^,;\'\-+=!@$%&*()]', ' ', new_text)
    # Collapse all whitespace last; \s already covers \x0b/\x0c, so the
    # original's separate replace() calls for those were no-ops.
    new_text = re.sub(r'\s+', ' ', new_text)

    return new_text

def text_processing(file_path: str) -> list[str]:
    """Extract, clean and chunk a PDF's text into normalized string chunks.

    Runs pdfminer extraction, applies text_cleaning, splits recursively
    using the configured chunk size/overlap, and returns each chunk with
    internal whitespace collapsed and ends stripped.
    """
    cleaned = text_cleaning(extract_text(file_path))
    chunker = RecursiveCharacterTextSplitter(
        chunk_size=Config.CHUNK_SIZE,
        chunk_overlap=Config.CHUNK_OVERLAP,
        add_start_index=True,
        separators=["\n\n", "\n", ".", " "],
    )
    return [
        re.sub(r'\s+', ' ', chunk).strip()
        for chunk in chunker.split_text(cleaned)
    ]

def ingest_into_vector_db_hnsw(file_path: str):
    """Build an HNSW-backed FAISS retriever for a PDF plus its summary.

    Chunks and embeds the document, indexes the vectors with an HNSW
    FAISS index, and returns a (retriever, summary) pair: a similarity
    retriever over the chunks and an LLM-generated one-paragraph summary.
    """
    chunks = text_processing(file_path)

    # Describe, then summarize, the whole document.
    description = document_description(" ".join(chunks))
    summary = doc_summarizer(description)
    print("doc summ: ", summary)

    embedder = HuggingFaceEmbeddings(model_name=Config.EMBEDDING_MODEL)
    chunk_vectors = embedder.embed_documents(chunks)

    dim = len(chunk_vectors[0])
    hnsw_index = faiss.IndexHNSWFlat(dim, 32)  # 32 = HNSW neighbors per node (M)
    hnsw_index.hnsw.efConstruction = 200
    hnsw_index.hnsw.efSearch = 100
    hnsw_index.add(np.array(chunk_vectors, dtype="float32"))

    # Positional ids: FAISS row i -> docstore key i -> chunk i.
    documents = [Document(page_content=chunk) for chunk in chunks]
    store = InMemoryDocstore(dict(enumerate(documents)))
    id_map = {pos: pos for pos in range(len(documents))}

    vectorstore = FAISS(
        embedding_function=embedder,
        index=hnsw_index,
        docstore=store,
        index_to_docstore_id=id_map
    )

    retriever = vectorstore.as_retriever(
        search_type="similarity",
        search_kwargs={
            'k': Config.RETRIEVER_K,
            # 'score_threshold': 0.4
        }
    )

    return retriever, summary