Spaces:
Running
Running
| from langchain_core.documents import Document | |
| from collections import defaultdict | |
| import re | |
| import pdfplumber | |
| import fitz # PyMuPDF | |
| import camelot | |
| import pytesseract | |
| from PIL import Image | |
| import io | |
| # ------------------------------- | |
| # STEP 1: EXTRACT RAW CONTENT | |
| # ------------------------------- | |
| def raw_document_text(pdf_path: str): | |
| documents = [] | |
| with pdfplumber.open(pdf_path) as pdf: | |
| doc_fitz = fitz.open(pdf_path) | |
| for page_index, page in enumerate(pdf.pages, start=1): | |
| # -------- TEXT -------- | |
| text = page.extract_text() | |
| if text: | |
| documents.append({ | |
| "content": text, | |
| "metadata": { | |
| "page": page_index, | |
| "type": "text" | |
| } | |
| }) | |
| # -------- TABLES -------- | |
| try: | |
| tables = camelot.read_pdf( | |
| pdf_path, | |
| pages=str(page_index), | |
| flavor="stream" | |
| ) | |
| for t_idx, table in enumerate(tables): | |
| table_text = table.df.to_string(index=False) | |
| documents.append({ | |
| "content": table_text, | |
| "metadata": { | |
| "page": page_index, | |
| "type": "table", | |
| "ref": f"Table {t_idx + 1}" | |
| } | |
| }) | |
| except Exception: | |
| pass | |
| # -------- IMAGES + OCR -------- | |
| page_fitz = doc_fitz[page_index - 1] | |
| images = page_fitz.get_images(full=True) | |
| for img_idx, img in enumerate(images): | |
| xref = img[0] | |
| base_image = doc_fitz.extract_image(xref) | |
| image_bytes = base_image["image"] | |
| image = Image.open(io.BytesIO(image_bytes)) | |
| ocr_text = pytesseract.image_to_string(image) | |
| if ocr_text.strip(): | |
| documents.append({ | |
| "content": ocr_text, | |
| "metadata": { | |
| "page": page_index, | |
| "type": "image", | |
| "ref": f"Image {img_idx + 1}" | |
| } | |
| }) | |
| return documents | |
| # ------------------------------- | |
| # STEP 2: RAW → LANGCHAIN DOCS | |
| # ------------------------------- | |
| def to_langchain_documents(raw_docs): | |
| lc_docs = [] | |
| for doc in raw_docs: | |
| lc_docs.append( | |
| Document( | |
| page_content=doc["content"], | |
| metadata=doc["metadata"] | |
| ) | |
| ) | |
| return lc_docs | |
| # ------------------------------- | |
| # STEP 3: BUILD INVERTED INDEX | |
| # ------------------------------- | |
| def build_inverted_index(lc_docs): | |
| index = defaultdict(set) | |
| for doc_id, doc in enumerate(lc_docs): | |
| words = re.findall(r"\b\w+\b", doc.page_content.lower()) | |
| for word in words: | |
| index[word].add(doc_id) | |
| return index | |
| # ------------------------------- | |
| # STEP 4: RUN PIPELINE | |
| # ------------------------------- | |
| if __name__ == "__main__": | |
| pdf_path = "Report.pdf" # <-- change path | |
| raw_docs = raw_document_text(pdf_path) | |
| lc_docs = to_langchain_documents(raw_docs) | |
| index = build_inverted_index(lc_docs) | |
| print(f"Total LangChain Documents: {len(lc_docs)}") | |
| print(f"Total Indexed Words: {len(index)}") | |
| # Preview index | |
| print(dict(list(index.items())[:20])) | |