File size: 2,728 Bytes
66cafc7
 
 
 
 
7bdc074
 
 
 
 
 
 
 
 
 
 
 
 
 
6048b8c
7bdc074
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import os

# Cache redirection MUST happen before any HuggingFace-backed library is
# imported -- these env vars are read at import time by transformers /
# sentence-transformers, so keep this block above the langchain imports.
os.environ['TRANSFORMERS_CACHE'] = '/tmp/hf_cache'
os.environ['HF_HOME'] = '/tmp/hf_home'
os.environ['SENTENCE_TRANSFORMERS_HOME'] = '/tmp/st_cache'

import shutil

import pytesseract
from PIL import Image

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document


# Function to load user documents and store their embeddings in the vector store
def embed_and_store(user_id: str) -> None:
    """OCR a user's images, chunk their PDFs, embed everything, and persist
    the result into the user's FAISS index under /tmp/docs/<user_id>.

    Args:
        user_id: Identifier used to locate the user's directories
            (``/tmp/docs/<user_id>/{pdfs,images,faiss_index}``).

    Side effects:
        Creates or updates the FAISS index on disk at
        ``/tmp/docs/<user_id>/faiss_index`` and prints a status line.
    """
    # Per-user directory layout.
    base_dir = os.path.join("/tmp/docs", user_id)
    pdf_dir = os.path.join(base_dir, "pdfs")
    image_dir = os.path.join(base_dir, "images")
    faiss_dir = os.path.join(base_dir, "faiss_index")

    # Extract text from each image via pytesseract OCR.
    # Guard against a missing directory (user uploaded no images).
    doc_images = []
    if os.path.isdir(image_dir):
        for filename in os.listdir(image_dir):
            if filename.lower().endswith((".png", ".jpg", ".jpeg")):
                image_path = os.path.join(image_dir, filename)
                # Context manager releases the file handle promptly
                # (the original leaked one handle per image).
                with Image.open(image_path) as image:
                    text = pytesseract.image_to_string(image)
                doc_images.append(
                    Document(page_content=text, metadata={"source": filename})
                )

    # Load PDFs and split them into overlapping chunks; the overlap keeps
    # context intact across chunk boundaries for retrieval.
    pdf_chunks = []
    if os.path.isdir(pdf_dir):
        loader = PyPDFDirectoryLoader(pdf_dir)
        splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
        pdf_chunks = splitter.split_documents(loader.load())

    documents = pdf_chunks + doc_images
    if not documents:
        # FAISS.from_documents raises on an empty list -- bail out early.
        print(f"⚠️ No documents found for user: {user_id}")
        return

    # Attach stable citation metadata to every chunk.
    updated_documents = []
    for i, doc in enumerate(documents):
        meta = doc.metadata.copy()
        # Use the filename when present, else a positional fallback id.
        # (The original indexed meta['source'] directly below, which raised
        # KeyError for any document lacking a 'source' key.)
        source = meta.get("source", f"doc_{i}")
        meta["doc_id"] = source
        meta["chunk_id"] = i
        # PDF loader chunks carry a page number; OCR'd images do not.
        if "page" in meta:
            meta["citation"] = f"{source} - page {meta['page']}, chunk {i}"
        else:
            meta["citation"] = f"{source} - chunk {i}"
        updated_documents.append(
            Document(page_content=doc.page_content, metadata=meta)
        )

    # Embedding model (weights cached under the env-configured HF dirs).
    embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')

    # Append to the user's existing index when present; otherwise create one.
    if os.path.exists(os.path.join(faiss_dir, "index.faiss")):
        # Deserialization is opted into because we wrote these index files
        # ourselves; do NOT load indexes from untrusted sources this way.
        vectorstore = FAISS.load_local(faiss_dir, embeddings, allow_dangerous_deserialization=True)
        vectorstore.add_documents(updated_documents)
    else:
        vectorstore = FAISS.from_documents(updated_documents, embeddings)

    vectorstore.save_local(faiss_dir)
    print(f"✅ FAISS updated for user: {user_id}")