Spaces:
Sleeping
Sleeping
| import fitz | |
| import io | |
| import base64 | |
| import numpy as np | |
| from PIL import Image | |
| from langchain_core.documents import Document | |
| from langchain_text_splitters import RecursiveCharacterTextSplitter | |
| from langchain_community.vectorstores import FAISS | |
| from embedding_functions import embed_text, embed_image | |
| def process_uploaded_pdf(pdf_bytes): | |
| doc = fitz.open(stream = pdf_bytes, filetype = "pdf") | |
| all_docs = [] | |
| all_embeddings = [] | |
| image_data_store = {} | |
| splitter = RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap = 100) | |
| for i,page in enumerate(doc): | |
| # TEXT PROCESSING | |
| text = page.get_text() | |
| if text.strip(): | |
| temp_doc = Document(page_content = text, metadata = {"page": i, "type": "text"}) | |
| text_chunks = splitter.split_documents([temp_doc]) | |
| for chunk in text_chunks: | |
| embedding = embed_text(chunk.page_content) | |
| all_embeddings.append(embedding) | |
| all_docs.append(chunk) | |
| # IMAGE PROCESSING | |
| for img_index, img in enumerate(page.get_images(full = True)): | |
| try: | |
| xref = img[0] | |
| base_image = doc.extract_image(xref) | |
| image_bytes = base_image["image"] | |
| # Convert to PIL Image | |
| pil_image = Image.open(io.BytesIO(image_bytes)).convert("RGB") | |
| # Create unique identifier | |
| image_id = f"page_{i}_img_{img_index}" | |
| buffered = io.BytesIO() | |
| pil_image.save(buffered, format = "PNG") | |
| img_base64 = base64.b64encode(buffered.getvalue()).decode() | |
| image_data_store[image_id] = img_base64 | |
| # Embed image using CLIP | |
| embedding = embed_image(pil_image) | |
| all_embeddings.append(embedding) | |
| # Create document for image | |
| image_doc = Document( | |
| page_content = f"[Image: {image_id}]", | |
| metadata = {"page": i, "type": "image", "image_id": image_id} | |
| ) | |
| all_docs.append(image_doc) | |
| except Exception as e: | |
| print(f"Error processing image {img_index} on page {i}: {e}") | |
| continue | |
| doc.close() | |
| # BUILD FAISS DATABASE | |
| embeddings_array = np.array(all_embeddings) | |
| vector_store = FAISS.from_embeddings( | |
| text_embeddings = [(d.page_content, emb) for d, emb in zip(all_docs, embeddings_array)], | |
| embedding = None, | |
| metadatas = [d.metadata for d in all_docs] | |
| ) | |
| return vector_store, image_data_store | |