Rauhan committed on
Commit
9aa78e6
·
1 Parent(s): f09733c

UPLOAD: files upload

Browse files
.env ADDED
@@ -0,0 +1 @@
 
 
1
+ GROQ_API_KEY=<REDACTED-ROTATE-THIS-KEY>
.gitignore ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python-generated files
2
+ __pycache__/
3
+ *.py[oc]
4
+ build/
5
+ dist/
6
+ wheels/
7
+ *.egg-info
8
+
9
+ # Virtual environments
10
+ .venv
+ 
+ # Secrets (a .env with an API key was committed in this same change)
+ .env
.python-version ADDED
@@ -0,0 +1 @@
 
 
1
+ 3.10
Dockerfile ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.10-slim
2
+
3
+ WORKDIR /app
4
+
5
+ COPY . /app
6
+
7
+ RUN apt-get update && apt-get install -y poppler-utils
8
+
9
+ RUN uv install .
10
+
11
+ EXPOSE 7860
12
+
13
+ CMD ["streamlit", "run", "app.py", "--server.address", "0.0.0.0", "--server.port", "7860"]
config.ini ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [GROQ CONFIG]
2
+ BASEURL = https://api.groq.com/openai/v1
3
+
4
+ [DETAIL EXTRACTOR]
5
+ VLM = meta-llama/llama-4-scout-17b-16e-instruct
6
+ BATCHSIZE = 5
7
+ MAXTOKENS = 1024
8
+ TEMPERATURE = 0.5
9
+
10
+ [SUMMARIZER]
11
+ LLM = llama-3.3-70b-versatile
12
+ MAXTOKENS = 2048
13
+ TEMPERATURE = 0.5
main.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from src.pipelines.pipeline import Pipeline
2
+ from datetime import datetime
3
+ import streamlit as st
4
+ from io import BytesIO
5
+ from fpdf import FPDF
6
+ import time
7
+
8
+ # Configure the page
9
+ st.set_page_config(
10
+ page_title="PDF Summarizer",
11
+ page_icon=None,
12
+ layout="wide"
13
+ )
14
+
15
+ # Initialize the pipeline
16
+ pipeline = Pipeline()
17
+
18
+ # Custom styling
19
+ st.markdown("""
20
+ <style>
21
+ .main-header {
22
+ font-size: 2.5rem;
23
+ color: #1E88E5;
24
+ margin-bottom: 2rem;
25
+ }
26
+ .summary-header {
27
+ font-size: 1.8rem;
28
+ color: #2E7D32;
29
+ margin: 1.5rem 0;
30
+ }
31
+ .status-container {
32
+ padding: 1rem;
33
+ border-radius: 0.5rem;
34
+ background-color: #f0f2f6;
35
+ margin: 1rem 0;
36
+ }
37
+ </style>
38
+ """, unsafe_allow_html=True)
39
+
40
+ # Sidebar
41
+ with st.sidebar:
42
+ st.markdown("### Upload PDF")
43
+ uploadedFile = st.file_uploader("Choose your PDF file", type=['pdf'])
44
+
45
+ if uploadedFile:
46
+ pdfDetails = {
47
+ 'File Name': uploadedFile.name,
48
+ 'File Size': f"{round(len(uploadedFile.getvalue()) / 1024, 2)} KB",
49
+ 'Upload Time': datetime.now().strftime("%Y-%m-%d %H:%M:%S")
50
+ }
51
+
52
+ st.markdown("### PDF Details")
53
+ for key, value in pdfDetails.items():
54
+ st.write(f"**{key}:** {value}")
55
+
56
+ # Main content
57
+ st.markdown("<h1 class='main-header'>PDF Summarizer</h1>", unsafe_allow_html=True)
58
+ st.write("""
59
+ This application generates a concise summary from your uploaded PDF document.
60
+ Upload a file to get started.
61
+ """)
62
+
63
+ if uploadedFile:
64
+ statusContainer = st.empty()
65
+ summaryContainer = st.empty()
66
+
67
+ with statusContainer.container():
68
+ st.markdown("### Processing Status")
69
+ statusBox = st.empty()
70
+
71
+ try:
72
+ # Read PDF
73
+ startTime = time.time()
74
+ statusBox.info("Reading PDF file...")
75
+ pdfBytes = uploadedFile.getvalue()
76
+ readDuration = time.time() - startTime
77
+ statusBox.success(f"PDF file read successfully ({readDuration:.2f}s)")
78
+
79
+ # Generate summary
80
+ statusBox.info("Generating summary...")
81
+ summaryStartTime = time.time()
82
+ summary = pipeline.run(pdfBytes)
83
+ totalTime = time.time() - startTime
84
+
85
+ if summary:
86
+ statusBox.success(f"Summary generated successfully (Total time: {totalTime:.2f}s)")
87
+
88
+ with summaryContainer.container():
89
+ st.markdown("<h2 class='summary-header'>Generated Summary</h2>", unsafe_allow_html=True)
90
+ st.markdown(summary)
91
+
92
+ if st.button("Download Summary as PDF"):
93
+ try:
94
+ pdf = FPDF()
95
+ pdf.add_page()
96
+ pdf.set_font("Arial", 'B', 16)
97
+ pdf.cell(200, 10, txt="PDF Summary", ln=True, align='C')
98
+ pdf.ln(10)
99
+ pdf.set_font("Arial", size=12)
100
+ pdf.multi_cell(0, 10, summary)
101
+
102
+ pdfOutput = BytesIO()
103
+ pdf.output(pdfOutput)
104
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
105
+
106
+ st.download_button(
107
+ label="Click to Download",
108
+ data=pdfOutput.getvalue(),
109
+ file_name=f"summary_{timestamp}.pdf",
110
+ mime="application/pdf"
111
+ )
112
+ except Exception as e:
113
+ st.error(f"Error creating PDF: {str(e)}")
114
+ else:
115
+ statusBox.error("Failed to generate summary. Please try again.")
116
+
117
+ except Exception as e:
118
+ statusBox.error(f"Error processing PDF: {str(e)}")
119
+ else:
120
+ st.info("Please upload a PDF file using the sidebar to get started.")
prompts.yaml ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ detailExtractorPrompt: |
2
+ You are a highly accurate detail extraction assistant. Your task is to extract essential and relevant details from PDF documents based solely on visible content.
3
+ Do not summarize, paraphrase, interpret, or assume any information. Extract only what is explicitly present and necessary—nothing more, nothing less.
4
+ Your output should include all significant facts, figures, names, dates, and other critical data points, while omitting redundant or irrelevant content.
5
+ Fidelity to the original source is absolute. No hallucination, no inference—only precise extraction of concrete details.
6
+
7
+ summaryEnginePrompt: |
8
+ You are a master-level summarization synthesis assistant. Your role is to take a series of chunk-level summaries and weave them into a single, comprehensive, and highly detailed summary that captures everything—no omissions, no distortions, no hallucinations.
9
+ You must preserve all important factual information, nuances, and specific details from the input summaries. Your output must reflect the full scope and depth of the original content.
10
+ Nothing is to be added, nothing is to be assumed. Completeness, accuracy, and fidelity to the source are your highest priorities.
pyproject.toml ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "pdfsummarizer"
3
+ version = "0.1.0"
4
+ description = "Streamlit app that summarizes PDF documents with Groq-hosted vision and text LLMs"
5
+ readme = "README.md"
6
+ requires-python = ">=3.10"
7
+ dependencies = [
8
+ "fpdf>=1.7.2",
9
+ "groq>=0.23.0",
10
+ "litellm>=1.67.1",
11
+ "loguru>=0.7.3",
12
+ "openai>=1.75.0",
13
+ "pdf2image>=1.17.0",
14
+ "python-dotenv>=1.1.0",
15
+ "pyyaml>=6.0.2",
16
+ "streamlit>=1.44.1",
17
+ ]
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ streamlit>=1.10.0
2
+ fpdf>=1.7.2
src/__init__.py ADDED
File without changes
src/components/__init__.py ADDED
File without changes
src/components/extractPdfDetails.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from ..utils.functions import getConfig, convertImageToBase64, getYaml
2
+ from pdf2image import convert_from_bytes
3
+ from ..utils.logger import logger
4
+ from dotenv import load_dotenv
5
+ from openai import OpenAI
6
+ from PIL import Image
7
+ import math
8
+ import os
9
+
10
+ load_dotenv()
11
+
12
+ class ExtractPdfDetails:
13
+ def __init__(self):
14
+ logger.info("INITIALIZING EXTRACT PDF DETAILS")
15
+ self.config = getConfig(os.path.join(os.getcwd(), "config.ini"))
16
+ self.prompts = getYaml(os.path.join(os.getcwd(), "prompts.yaml"))
17
+ self.llmClient = OpenAI(
18
+ base_url = self.config["GROQ CONFIG"]["BASEURL"],
19
+ api_key = os.environ["GROQ_API_KEY"]
20
+ )
21
+
22
+ def convertToImages(self, pdfBytes: str) -> list[Image.Image]:
23
+ """
24
+ Convert a pdf to a list of images
25
+ Args:
26
+ pdfBytes: bytes of the pdf file
27
+ Returns:
28
+ list[Image.Image]: list of pdf pages as images
29
+ """
30
+ try:
31
+ logger.info(f"Converting pdf to images")
32
+ images = convert_from_bytes(pdfBytes)
33
+ return images
34
+ except Exception as e:
35
+ logger.error(f"Error converting pdf to images: {e}")
36
+ return None
37
+
38
+ def chunkImages(self, images: list[Image.Image]) -> list[list[Image.Image]]:
39
+ """
40
+ Chunk the images into smaller chunks
41
+ Args:
42
+ images: list of images
43
+ Returns:
44
+ chunks: list of chunks of images
45
+ """
46
+ try:
47
+ logger.info("Chunking the images")
48
+ batchSize = self.config["DETAIL EXTRACTOR"]["BATCHSIZE"]
49
+ nBatches = math.ceil(len(images) / batchSize)
50
+ chunks = [images[batchSize * x: batchSize * x + batchSize] for x in range(nBatches)]
51
+ return chunks
52
+ except Exception as e:
53
+ logger.error(f"Error chunking the images: {e}")
54
+ return None
55
+
56
+ def extractDetailsFromChunk(self, images: list[Image.Image]) -> str:
57
+ """
58
+ Extract details from a chunk of images
59
+ Args:
60
+ images: list of images
61
+ Returns:
62
+ details: string of details extracted from the images
63
+ """
64
+ try:
65
+ logger.info("Extracting details from the images")
66
+ completion = self.llmClient.chat.completions.create(
67
+ model = self.config["DETAIL EXTRACTOR"]["VLM"],
68
+ messages = [
69
+ {"role": "system", "content": self.prompts["detailExtractorPrompt"]},
70
+ {"role": "user", "content": [{"type": "image_url", "image_url": {"url": convertImageToBase64(image)}} for image in images]}
71
+ ],
72
+ temperature = self.config["DETAIL EXTRACTOR"]["TEMPERATURE"],
73
+ max_tokens = self.config["DETAIL EXTRACTOR"]["MAXTOKENS"],
74
+ stream = False
75
+ )
76
+ response = completion.choices[0].message.content
77
+ return response
78
+ except Exception as e:
79
+ logger.error(f"Error extracting details from the images: {e}")
80
+ return None
src/components/summaryEngine.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from ..utils.functions import getConfig, getYaml
2
+ from ..utils.logger import logger
3
+ import litellm
4
+ import os
5
+
6
+ class SummaryEngine:
7
+ def __init__(self):
8
+ logger.info("INITIALIZING SUMMARY ENGINE")
9
+ self.config = getConfig(os.path.join(os.getcwd(), "config.ini"))
10
+ self.prompts = getYaml(os.path.join(os.getcwd(), "prompts.yaml"))
11
+
12
+ def summarize(self, texts: list[str]) -> str:
13
+ """
14
+ Summarize a text
15
+ Args:
16
+ texts: list of texts to summarize
17
+ Returns:
18
+ summary: summary of the texts
19
+ """
20
+ try:
21
+ logger.info("Summarizing the details extracted from the images")
22
+ allSummaries = "\n".join(texts)
23
+ completion = litellm.completion(
24
+ model = self.config["SUMMARIZER"]["LLM"],
25
+ api_key = os.environ["GROQ_API_KEY"],
26
+ api_base = self.config["GROQ CONFIG"]["BASEURL"],
27
+ messages = [
28
+ {"role": "system", "content": self.prompts["summaryEnginePrompt"]},
29
+ {"role": "user", "content": f"AGGEREGATED SUMMARIES: {allSummaries}"}
30
+ ],
31
+ max_tokens = self.config["SUMMARIZER"]["MAXTOKENS"],
32
+ temperature = self.config["SUMMARIZER"]["TEMPERATURE"]
33
+ )
34
+ response = completion["choices"][0]["message"]["content"]
35
+ logger.info("Summary generated successfully")
36
+ return response
37
+ except Exception as e:
38
+ logger.error(f"Error summarizing the text: {e}")
39
+ return None
src/pipelines/__init__.py ADDED
File without changes
src/pipelines/pipeline.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from ..components.extractPdfDetails import ExtractPdfDetails
2
+ from ..components.summaryEngine import SummaryEngine
3
+ from concurrent.futures import ThreadPoolExecutor
4
+ from ..utils.logger import logger
5
+
6
+ class Pipeline:
7
+ def __init__(self):
8
+ logger.info("INITIALIZING PIPELINE")
9
+ self.extractPdfDetails = ExtractPdfDetails()
10
+ self.summaryEngine = SummaryEngine()
11
+
12
+ def run(self, pdfBytes: bytes) -> str:
13
+ """
14
+ Run the pipeline
15
+ Args:
16
+ pdfBytes: bytes of the pdf file
17
+ Returns:
18
+ summary: summary of the pdf file
19
+ """
20
+ try:
21
+ logger.info("Running the pipeline")
22
+ images = self.extractPdfDetails.convertToImages(pdfBytes = pdfBytes)
23
+ chunks = self.extractPdfDetails.chunkImages(images = images)
24
+ with ThreadPoolExecutor(max_workers = 30) as executor:
25
+ futures = [executor.submit(self.extractPdfDetails.extractDetailsFromChunk, chunk) for chunk in chunks]
26
+ summaries = [future.result() for future in futures]
27
+ summary = self.summaryEngine.summarize(texts = summaries)
28
+ return summary
29
+ except Exception as e:
30
+ logger.error(f"Error running the pipeline: {e}")
31
+ return None
src/utils/__init__.py ADDED
File without changes
src/utils/functions.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from io import BytesIO
2
+ from PIL import Image
3
+ import configparser
4
+ import base64
5
+ import yaml
6
+
7
def getConfig(configFilePath: str) -> configparser.ConfigParser:
    """
    Load an ini-style configuration file.
    Args:
        configFilePath: path to the config file
    Returns:
        config: populated ConfigParser instance (empty if the file is missing)
    """
    parser = configparser.ConfigParser()
    parser.read(configFilePath)
    return parser
18
+
19
def convertImageToBase64(image: Image.Image) -> str:
    """
    Encode a PIL image as a base64 JPEG string.
    Args:
        image: image object
    Returns:
        imageString: base64 string of the image (JPEG, quality 85)
    """
    jpegBuffer = BytesIO()
    image.save(jpegBuffer, format = "JPEG", optimize = True, quality = 85)
    return base64.b64encode(jpegBuffer.getvalue()).decode("utf-8")
31
+
32
def getYaml(yamlFilePath: str) -> dict:
    """
    Parse a yaml file into a Python object.
    Args:
        yamlFilePath: path to the yaml file
    Returns:
        yamlData: parsed yaml content
    """
    with open(yamlFilePath, "r") as yamlFile:
        return yaml.safe_load(yamlFile)
src/utils/logger.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
# Central loguru configuration, imported for its side effects: every module
# does `from ..utils.logger import logger` and shares these two sinks.
from loguru import logger
import sys
import os

# Log file lives under <cwd>/logs; loguru creates missing parent directories.
logFilePath = os.path.join(os.getcwd(), "logs", "runningLogs.log")
logFormat = "<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level: <8}</level> | <cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> - <level>{message}</level>"

# Drop loguru's default stderr sink before installing ours.
logger.remove()
logger.add(sys.stdout, colorize = True, format = logFormat)
# enqueue=True routes records through a queue, making the file sink safe to use
# from the pipeline's ThreadPoolExecutor workers.
# NOTE(review): mode="w" truncates the log file on every process start —
# confirm that discarding previous runs' logs is intentional.
logger.add(logFilePath, format = logFormat, enqueue = True, mode = "w")
uv.lock ADDED
The diff for this file is too large to render. See raw diff