# docu-backend / rag / t.py
# tech5's picture
# Deploy FastAPI RAG backend
# e27c97c
import io
import logging
import re
from collections import defaultdict

import camelot
import fitz  # PyMuPDF
import pdfplumber
import pytesseract
from langchain_core.documents import Document
from PIL import Image
# -------------------------------
# STEP 1: EXTRACT RAW CONTENT
# -------------------------------
def raw_document_text(pdf_path: str) -> list:
    """Extract text, tables, and OCR'd image text from every page of a PDF.

    Each page is run through three extractors, in this order: pdfplumber
    for the text layer, camelot (``stream`` flavor) for tables, and
    PyMuPDF + pytesseract for text found inside embedded images.

    Table extraction is best-effort: if camelot fails on a page the failure
    is logged and that page's tables are skipped. Errors raised by the text
    and image extractors are not caught here and propagate to the caller.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A list of dicts in page order. Every dict has a ``"content"`` string
        and a ``"metadata"`` dict holding ``"page"`` (1-based page number)
        and ``"type"`` (``"text"``, ``"table"`` or ``"image"``). Table and
        image records additionally carry a ``"ref"`` label such as
        ``"Table 1"`` or ``"Image 2"``, numbered per page.
    """
    logger = logging.getLogger(__name__)
    documents = []

    # Both handles are context-managed so the PyMuPDF document is closed
    # as well, including when an extractor raises part-way through.
    with pdfplumber.open(pdf_path) as pdf, fitz.open(pdf_path) as doc_fitz:
        for page_index, page in enumerate(pdf.pages, start=1):
            # -------- TEXT --------
            text = page.extract_text()
            if text:
                documents.append({
                    "content": text,
                    "metadata": {
                        "page": page_index,
                        "type": "text",
                    },
                })

            # -------- TABLES --------
            try:
                tables = camelot.read_pdf(
                    pdf_path,
                    pages=str(page_index),
                    flavor="stream",
                )
                for t_idx, table in enumerate(tables):
                    table_text = table.df.to_string(index=False)
                    documents.append({
                        "content": table_text,
                        "metadata": {
                            "page": page_index,
                            "type": "table",
                            "ref": f"Table {t_idx + 1}",
                        },
                    })
            except Exception:
                # Deliberately broad: a table failure must not abort the
                # rest of the extraction, but it should leave a trace
                # instead of vanishing silently.
                logger.warning(
                    "Table extraction failed on page %d of %s; skipping tables",
                    page_index,
                    pdf_path,
                    exc_info=True,
                )

            # -------- IMAGES + OCR --------
            # pdfplumber pages are enumerated from 1; PyMuPDF indexes from 0.
            page_fitz = doc_fitz[page_index - 1]
            images = page_fitz.get_images(full=True)
            for img_idx, img in enumerate(images):
                xref = img[0]  # first field of the image tuple is its xref
                base_image = doc_fitz.extract_image(xref)
                image_bytes = base_image["image"]
                image = Image.open(io.BytesIO(image_bytes))
                ocr_text = pytesseract.image_to_string(image)
                # Keep only images where OCR found non-whitespace text.
                if ocr_text.strip():
                    documents.append({
                        "content": ocr_text,
                        "metadata": {
                            "page": page_index,
                            "type": "image",
                            "ref": f"Image {img_idx + 1}",
                        },
                    })

    return documents
# -------------------------------
# STEP 2: RAW → LANGCHAIN DOCS
# -------------------------------
def to_langchain_documents(raw_docs):
    """Wrap raw extraction records in LangChain ``Document`` objects.

    Each item of ``raw_docs`` must provide a ``"content"`` entry, which
    becomes ``page_content``, and a ``"metadata"`` entry, which is passed
    through as the document's ``metadata``. Order is preserved.
    """
    return [
        Document(page_content=record["content"], metadata=record["metadata"])
        for record in raw_docs
    ]
# -------------------------------
# STEP 3: BUILD INVERTED INDEX
# -------------------------------
def build_inverted_index(lc_docs):
    """Map every lower-cased word to the positions of the docs containing it.

    A word is any ``\\w+`` run delimited by word boundaries in a document's
    ``page_content``. The returned ``defaultdict(set)`` is keyed by word;
    each value is the set of indices (positions within ``lc_docs``) of the
    documents in which that word occurs.
    """
    word_pattern = re.compile(r"\b\w+\b")
    postings = defaultdict(set)
    for position, document in enumerate(lc_docs):
        lowered = document.page_content.lower()
        # Words are visited in reading order so keys are inserted in the
        # order they are first encountered.
        for match in word_pattern.finditer(lowered):
            postings[match.group()].add(position)
    return postings
# -------------------------------
# STEP 4: RUN PIPELINE
# -------------------------------
if __name__ == "__main__":
    # Script-only dependencies are imported here so that importing this
    # module as a library is unaffected.
    import sys
    from itertools import islice

    # The PDF path may be given as the first command-line argument;
    # without one the script falls back to "Report.pdf".
    pdf_path = sys.argv[1] if len(sys.argv) > 1 else "Report.pdf"

    raw_docs = raw_document_text(pdf_path)
    lc_docs = to_langchain_documents(raw_docs)
    index = build_inverted_index(lc_docs)

    print(f"Total LangChain Documents: {len(lc_docs)}")
    print(f"Total Indexed Words: {len(index)}")

    # Preview the first 20 index entries without copying the whole index.
    print(dict(islice(index.items(), 20)))