# tech5's picture
# Copied GitHub project to Hugging Face Space
# 66cafc7
import os
# Point the Hugging Face / sentence-transformers cache locations at /tmp.
# These assignments are deliberately placed before the langchain and
# embedding imports below.
# NOTE(review): presumably the libraries read these variables when they are
# imported and /tmp is the writable location in the deployment - confirm.
os.environ['TRANSFORMERS_CACHE'] = '/tmp/hf_cache'
os.environ['HF_HOME'] = '/tmp/hf_home'
os.environ['SENTENCE_TRANSFORMERS_HOME'] = '/tmp/st_cache'
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
import os
import shutil
from PIL import Image
import pytesseract
# Function to load documents and save them into the vector store
def embed_and_store(user_id: str):
    """Embed a user's PDFs and images and persist them in a FAISS index.

    PDFs are read from ``/tmp/docs/<user_id>/pdfs`` and split into chunks;
    images are read from ``/tmp/docs/<user_id>/images`` and converted to text
    with Tesseract OCR. Every resulting document gets ``doc_id``,
    ``chunk_id`` and ``citation`` metadata, is embedded, and is written to
    ``/tmp/docs/<user_id>/faiss_index``. If an index already exists there, the
    new documents are appended to it; otherwise a new index is created.

    Args:
        user_id: Name of the per-user directory under ``/tmp/docs``.
            NOTE(review): the value is joined into a filesystem path
            unchanged; if it can come from an untrusted request, the caller
            should validate it (e.g. reject path separators and "..").

    Returns:
        None. When no documents are found, nothing is embedded and any
        existing index is left untouched.
    """
    # Setup user directories
    base_dir = os.path.join("/tmp/docs", user_id)
    pdf_dir = os.path.join(base_dir, "pdfs")
    image_dir = os.path.join(base_dir, "images")
    faiss_dir = os.path.join(base_dir, "faiss_index")

    # OCR text is kept as one document per image (it is not chunked).
    doc_images = _ocr_image_documents(image_dir)

    # Load PDFs and split them into overlapping chunks.
    pdf_pages = PyPDFDirectoryLoader(pdf_dir).load()
    splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
    documents = splitter.split_documents(pdf_pages) + doc_images

    updated_documents = _with_citation_metadata(documents)
    if not updated_documents:
        # Building a FAISS index from an empty list fails, so stop early
        # instead of crashing or rewriting an unchanged index.
        print(f"No documents found for user: {user_id}; FAISS index not updated")
        return

    # Load HuggingFace Embedding model
    embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')

    # Load existing FAISS index if exists
    if os.path.exists(os.path.join(faiss_dir, "index.faiss")):
        # SECURITY NOTE: load_local unpickles the stored docstore, which is
        # why allow_dangerous_deserialization must be set. This is only safe
        # as long as faiss_dir contains files written by this function.
        vectorstore = FAISS.load_local(faiss_dir, embeddings, allow_dangerous_deserialization=True)
        # NOTE(review): chunk_id restarts at 0 on every call, so ids of newly
        # added documents can repeat ids already stored in the index.
        vectorstore.add_documents(updated_documents)
    else:
        vectorstore = FAISS.from_documents(updated_documents, embeddings)

    vectorstore.save_local(faiss_dir)
    print(f"✅ FAISS updated for user: {user_id}")


def _ocr_image_documents(image_dir):
    """Return one Document per PNG/JPEG in *image_dir*, holding its OCR text."""
    # A user may have uploaded no images at all; treat a missing directory
    # as "no images" rather than raising FileNotFoundError.
    if not os.path.isdir(image_dir):
        return []

    image_documents = []
    # Sorted so that document order (and thus chunk ids) is deterministic.
    for filename in sorted(os.listdir(image_dir)):
        if not filename.lower().endswith((".png", ".jpg", ".jpeg")):
            continue
        image_path = os.path.join(image_dir, filename)
        # Context manager closes the underlying file handle after OCR.
        with Image.open(image_path) as image:
            text = pytesseract.image_to_string(image)
        image_documents.append(Document(page_content=text, metadata={"source": filename}))
    return image_documents


def _with_citation_metadata(documents):
    """Return copies of *documents* with doc_id, chunk_id and citation set."""
    annotated = []
    for i, doc in enumerate(documents):
        meta = doc.metadata.copy()
        # Use the source filename as the id, or a positional fallback.
        doc_id = meta.get("source", f"doc_{i}")
        meta["doc_id"] = doc_id
        meta["chunk_id"] = i
        # Build the citation from doc_id so a document without a "source"
        # key does not raise KeyError.
        if "page" in meta:
            # Page number is available (PDF pages carry it).
            meta["citation"] = f"{doc_id} - page {meta['page']}, chunk {i}"
        else:
            meta["citation"] = f"{doc_id} - chunk {i}"
        annotated.append(Document(page_content=doc.page_content, metadata=meta))
    return annotated