Spaces:
Sleeping
Sleeping
# NOTE: the cache environment variables must be set BEFORE importing
# transformers / sentence-transformers (pulled in by HuggingFaceEmbeddings),
# otherwise those libraries lock in their default cache locations, which are
# not writable in many hosted environments.
import os

os.environ['TRANSFORMERS_CACHE'] = '/tmp/hf_cache'
os.environ['HF_HOME'] = '/tmp/hf_home'
os.environ['SENTENCE_TRANSFORMERS_HOME'] = '/tmp/st_cache'

import shutil

import pytesseract
from PIL import Image

from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
| # Funtion for load Documents and Save it into Vector Stores | |
# Load a user's documents (PDFs + OCR'd images) and persist them into a
# per-user FAISS vector store.
def embed_and_store(user_id: str) -> None:
    """OCR the user's images, load and chunk their PDFs, and embed
    everything into that user's on-disk FAISS index.

    Expects input under ``/tmp/docs/<user_id>/pdfs`` and
    ``/tmp/docs/<user_id>/images``; the index is created (or extended)
    at ``/tmp/docs/<user_id>/faiss_index``. Missing input directories
    are tolerated; if no documents are found, the function returns
    without touching the index.

    Args:
        user_id: Identifier used to namespace the on-disk directories.
    """
    # Per-user directory layout.
    base_dir = os.path.join("/tmp/docs", user_id)
    pdf_dir = os.path.join(base_dir, "pdfs")
    image_dir = os.path.join(base_dir, "images")
    faiss_dir = os.path.join(base_dir, "faiss_index")

    # --- Extract image text with pytesseract OCR ---------------------------
    # Tolerate a missing images directory instead of raising FileNotFoundError.
    doc_images = []
    if os.path.isdir(image_dir):
        for filename in os.listdir(image_dir):
            if filename.lower().endswith((".png", ".jpg", ".jpeg")):
                image_path = os.path.join(image_dir, filename)
                # Context manager ensures the image file handle is closed
                # promptly (the original leaked one handle per image).
                with Image.open(image_path) as image:
                    text = pytesseract.image_to_string(image)
                doc_images.append(
                    Document(page_content=text, metadata={"source": filename})
                )

    # --- Load PDFs and split into overlapping retrieval chunks -------------
    pdf_chunks = []
    if os.path.isdir(pdf_dir):
        loader = PyPDFDirectoryLoader(pdf_dir)
        docs = loader.load()
        splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
        pdf_chunks = splitter.split_documents(docs)

    documents = pdf_chunks + doc_images
    if not documents:
        # FAISS.from_documents raises on an empty list — nothing to index.
        print(f"⚠️ No documents found for user: {user_id}")
        return

    # --- Attach stable ids and human-readable citations --------------------
    updated_documents = []
    for i, doc in enumerate(documents):
        meta = doc.metadata.copy()
        source = meta.get("source", f"doc_{i}")  # filename, or synthetic fallback
        meta["doc_id"] = source
        meta["chunk_id"] = i
        # PDF chunks carry a page number; OCR'd images do not.
        if "page" in meta:
            meta["citation"] = f"{source} - page {meta['page']}, chunk {i}"
        else:
            meta["citation"] = f"{source} - chunk {i}"
        updated_documents.append(
            Document(page_content=doc.page_content, metadata=meta)
        )

    # --- Embed and persist --------------------------------------------------
    embeddings = HuggingFaceEmbeddings(
        model_name='sentence-transformers/all-MiniLM-L6-v2'
    )

    # Extend the user's existing index if one is on disk, otherwise create it.
    # allow_dangerous_deserialization: the index is produced only by this
    # process, so unpickling our own file is acceptable — never point this
    # at an index from an untrusted source.
    if os.path.exists(os.path.join(faiss_dir, "index.faiss")):
        vectorstore = FAISS.load_local(
            faiss_dir, embeddings, allow_dangerous_deserialization=True
        )
        vectorstore.add_documents(updated_documents)
    else:
        vectorstore = FAISS.from_documents(updated_documents, embeddings)
    vectorstore.save_local(faiss_dir)
    print(f"✅ FAISS updated for user: {user_id}")