Rauhan committed on
Commit
9aa78e6
·
1 Parent(s): f09733c

UPLOAD: files upload

Browse files
.env ADDED
@@ -0,0 +1 @@
 
 
1
+ GROQ_API_KEY=<REDACTED-ROTATE-THIS-KEY>
.gitignore ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python-generated files
2
+ __pycache__/
3
+ *.py[oc]
4
+ build/
5
+ dist/
6
+ wheels/
7
+ *.egg-info
8
+
9
+ # Virtual environments
10
+ .venv
+ 
+ # Secrets (a .env with an API key was committed in this same change)
+ .env
.python-version ADDED
@@ -0,0 +1 @@
 
 
1
+ 3.10
Dockerfile ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.10-slim
2
+
3
+ WORKDIR /app
4
+
5
+ COPY . /app
6
+
7
+ RUN apt-get update && apt-get install -y poppler-utils
8
+
9
+ RUN uv install .
10
+
11
+ EXPOSE 7860
12
+
13
+ CMD ["streamlit", "run", "app.py", "--server.address", "0.0.0.0", "--server.port", "7860"]
config.ini ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [GROQ CONFIG]
2
+ BASEURL = https://api.groq.com/openai/v1
3
+
4
+ [DETAIL EXTRACTOR]
5
+ VLM = meta-llama/llama-4-scout-17b-16e-instruct
6
+ BATCHSIZE = 5
7
+ MAXTOKENS = 1024
8
+ TEMPERATURE = 0.5
9
+
10
+ [SUMMARIZER]
11
+ LLM = llama-3.3-70b-versatile
12
+ MAXTOKENS = 2048
13
+ TEMPERATURE = 0.5
main.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from src.pipelines.pipeline import Pipeline
2
+ from datetime import datetime
3
+ import streamlit as st
4
+ from io import BytesIO
5
+ from fpdf import FPDF
6
+ import time
7
+
8
+ # Configure the page
9
+ st.set_page_config(
10
+ page_title="PDF Summarizer",
11
+ page_icon=None,
12
+ layout="wide"
13
+ )
14
+
15
+ # Initialize the pipeline
16
+ pipeline = Pipeline()
17
+
18
+ # Custom styling
19
+ st.markdown("""
20
+ <style>
21
+ .main-header {
22
+ font-size: 2.5rem;
23
+ color: #1E88E5;
24
+ margin-bottom: 2rem;
25
+ }
26
+ .summary-header {
27
+ font-size: 1.8rem;
28
+ color: #2E7D32;
29
+ margin: 1.5rem 0;
30
+ }
31
+ .status-container {
32
+ padding: 1rem;
33
+ border-radius: 0.5rem;
34
+ background-color: #f0f2f6;
35
+ margin: 1rem 0;
36
+ }
37
+ </style>
38
+ """, unsafe_allow_html=True)
39
+
40
+ # Sidebar
41
+ with st.sidebar:
42
+ st.markdown("### Upload PDF")
43
+ uploadedFile = st.file_uploader("Choose your PDF file", type=['pdf'])
44
+
45
+ if uploadedFile:
46
+ pdfDetails = {
47
+ 'File Name': uploadedFile.name,
48
+ 'File Size': f"{round(len(uploadedFile.getvalue()) / 1024, 2)} KB",
49
+ 'Upload Time': datetime.now().strftime("%Y-%m-%d %H:%M:%S")
50
+ }
51
+
52
+ st.markdown("### PDF Details")
53
+ for key, value in pdfDetails.items():
54
+ st.write(f"**{key}:** {value}")
55
+
56
+ # Main content
57
+ st.markdown("<h1 class='main-header'>PDF Summarizer</h1>", unsafe_allow_html=True)
58
+ st.write("""
59
+ This application generates a concise summary from your uploaded PDF document.
60
+ Upload a file to get started.
61
+ """)
62
+
63
+ if uploadedFile:
64
+ statusContainer = st.empty()
65
+ summaryContainer = st.empty()
66
+
67
+ with statusContainer.container():
68
+ st.markdown("### Processing Status")
69
+ statusBox = st.empty()
70
+
71
+ try:
72
+ # Read PDF
73
+ startTime = time.time()
74
+ statusBox.info("Reading PDF file...")
75
+ pdfBytes = uploadedFile.getvalue()
76
+ readDuration = time.time() - startTime
77
+ statusBox.success(f"PDF file read successfully ({readDuration:.2f}s)")
78
+
79
+ # Generate summary
80
+ statusBox.info("Generating summary...")
81
+ summaryStartTime = time.time()
82
+ summary = pipeline.run(pdfBytes)
83
+ totalTime = time.time() - startTime
84
+
85
+ if summary:
86
+ statusBox.success(f"Summary generated successfully (Total time: {totalTime:.2f}s)")
87
+
88
+ with summaryContainer.container():
89
+ st.markdown("<h2 class='summary-header'>Generated Summary</h2>", unsafe_allow_html=True)
90
+ st.markdown(summary)
91
+
92
+ if st.button("Download Summary as PDF"):
93
+ try:
94
+ pdf = FPDF()
95
+ pdf.add_page()
96
+ pdf.set_font("Arial", 'B', 16)
97
+ pdf.cell(200, 10, txt="PDF Summary", ln=True, align='C')
98
+ pdf.ln(10)
99
+ pdf.set_font("Arial", size=12)
100
+ pdf.multi_cell(0, 10, summary)
101
+
102
+ pdfOutput = BytesIO()
103
+ pdf.output(pdfOutput)
104
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
105
+
106
+ st.download_button(
107
+ label="Click to Download",
108
+ data=pdfOutput.getvalue(),
109
+ file_name=f"summary_{timestamp}.pdf",
110
+ mime="application/pdf"
111
+ )
112
+ except Exception as e:
113
+ st.error(f"Error creating PDF: {str(e)}")
114
+ else:
115
+ statusBox.error("Failed to generate summary. Please try again.")
116
+
117
+ except Exception as e:
118
+ statusBox.error(f"Error processing PDF: {str(e)}")
119
+ else:
120
+ st.info("Please upload a PDF file using the sidebar to get started.")
prompts.yaml ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ detailExtractorPrompt: |
2
+ You are a highly accurate detail extraction assistant. Your task is to extract essential and relevant details from PDF documents based solely on visible content.
3
+ Do not summarize, paraphrase, interpret, or assume any information. Extract only what is explicitly present and necessary—nothing more, nothing less.
4
+ Your output should include all significant facts, figures, names, dates, and other critical data points, while omitting redundant or irrelevant content.
5
+ Fidelity to the original source is absolute. No hallucination, no inference—only precise extraction of concrete details.
6
+
7
+ summaryEnginePrompt: |
8
+ You are a master-level summarization synthesis assistant. Your role is to take a series of chunk-level summaries and weave them into a single, comprehensive, and highly detailed summary that captures everything—no omissions, no distortions, no hallucinations.
9
+ You must preserve all important factual information, nuances, and specific details from the input summaries. Your output must reflect the full scope and depth of the original content.
10
+ Nothing is to be added, nothing is to be assumed. Completeness, accuracy, and fidelity to the source are your highest priorities.
pyproject.toml ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "pdfsummarizer"
3
+ version = "0.1.0"
4
+ description = "Streamlit app that summarizes PDF documents with Groq-hosted vision and text LLMs"
5
+ readme = "README.md"
6
+ requires-python = ">=3.10"
7
+ dependencies = [
8
+ "fpdf>=1.7.2",
9
+ "groq>=0.23.0",
10
+ "litellm>=1.67.1",
11
+ "loguru>=0.7.3",
12
+ "openai>=1.75.0",
13
+ "pdf2image>=1.17.0",
14
+ "python-dotenv>=1.1.0",
15
+ "pyyaml>=6.0.2",
16
+ "streamlit>=1.44.1",
17
+ ]
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ streamlit>=1.10.0
2
+ fpdf>=1.7.2
src/__init__.py ADDED
File without changes
src/components/__init__.py ADDED
File without changes
src/components/extractPdfDetails.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from ..utils.functions import getConfig, convertImageToBase64, getYaml
2
+ from pdf2image import convert_from_bytes
3
+ from ..utils.logger import logger
4
+ from dotenv import load_dotenv
5
+ from openai import OpenAI
6
+ from PIL import Image
7
+ import math
8
+ import os
9
+
10
+ load_dotenv()
11
+
12
+ class ExtractPdfDetails:
13
+ def __init__(self):
14
+ logger.info("INITIALIZING EXTRACT PDF DETAILS")
15
+ self.config = getConfig(os.path.join(os.getcwd(), "config.ini"))
16
+ self.prompts = getYaml(os.path.join(os.getcwd(), "prompts.yaml"))
17
+ self.llmClient = OpenAI(
18
+ base_url = self.config["GROQ CONFIG"]["BASEURL"],
19
+ api_key = os.environ["GROQ_API_KEY"]
20
+ )
21
+
22
+ def convertToImages(self, pdfBytes: str) -> list[Image.Image]:
23
+ """
24
+ Convert a pdf to a list of images
25
+ Args:
26
+ pdfBytes: bytes of the pdf file
27
+ Returns:
28
+ list[Image.Image]: list of pdf pages as images
29
+ """
30
+ try:
31
+ logger.info(f"Converting pdf to images")
32
+ images = convert_from_bytes(pdfBytes)
33
+ return images
34
+ except Exception as e:
35
+ logger.error(f"Error converting pdf to images: {e}")
36
+ return None
37
+
38
+ def chunkImages(self, images: list[Image.Image]) -> list[list[Image.Image]]:
39
+ """
40
+ Chunk the images into smaller chunks
41
+ Args:
42
+ images: list of images
43
+ Returns:
44
+ chunks: list of chunks of images
45
+ """
46
+ try:
47
+ logger.info("Chunking the images")
48
+ batchSize = self.config["DETAIL EXTRACTOR"]["BATCHSIZE"]
49
+ nBatches = math.ceil(len(images) / batchSize)
50
+ chunks = [images[batchSize * x: batchSize * x + batchSize] for x in range(nBatches)]
51
+ return chunks
52
+ except Exception as e:
53
+ logger.error(f"Error chunking the images: {e}")
54
+ return None
55
+
56
+ def extractDetailsFromChunk(self, images: list[Image.Image]) -> str:
57
+ """
58
+ Extract details from a chunk of images
59
+ Args:
60
+ images: list of images
61
+ Returns:
62
+ details: string of details extracted from the images
63
+ """
64
+ try:
65
+ logger.info("Extracting details from the images")
66
+ completion = self.llmClient.chat.completions.create(
67
+ model = self.config["DETAIL EXTRACTOR"]["VLM"],
68
+ messages = [
69
+ {"role": "system", "content": self.prompts["detailExtractorPrompt"]},
70
+ {"role": "user", "content": [{"type": "image_url", "image_url": {"url": convertImageToBase64(image)}} for image in images]}
71
+ ],
72
+ temperature = self.config["DETAIL EXTRACTOR"]["TEMPERATURE"],
73
+ max_tokens = self.config["DETAIL EXTRACTOR"]["MAXTOKENS"],
74
+ stream = False
75
+ )
76
+ response = completion.choices[0].message.content
77
+ return response
78
+ except Exception as e:
79
+ logger.error(f"Error extracting details from the images: {e}")
80
+ return None
src/components/summaryEngine.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from ..utils.functions import getConfig, getYaml
2
+ from ..utils.logger import logger
3
+ import litellm
4
+ import os
5
+
6
+ class SummaryEngine:
7
+ def __init__(self):
8
+ logger.info("INITIALIZING SUMMARY ENGINE")
9
+ self.config = getConfig(os.path.join(os.getcwd(), "config.ini"))
10
+ self.prompts = getYaml(os.path.join(os.getcwd(), "prompts.yaml"))
11
+
12
+ def summarize(self, texts: list[str]) -> str:
13
+ """
14
+ Summarize a text
15
+ Args:
16
+ texts: list of texts to summarize
17
+ Returns:
18
+ summary: summary of the texts
19
+ """
20
+ try:
21
+ logger.info("Summarizing the details extracted from the images")
22
+ allSummaries = "\n".join(texts)
23
+ completion = litellm.completion(
24
+ model = self.config["SUMMARIZER"]["LLM"],
25
+ api_key = os.environ["GROQ_API_KEY"],
26
+ api_base = self.config["GROQ CONFIG"]["BASEURL"],
27
+ messages = [
28
+ {"role": "system", "content": self.prompts["summaryEnginePrompt"]},
29
+ {"role": "user", "content": f"AGGEREGATED SUMMARIES: {allSummaries}"}
30
+ ],
31
+ max_tokens = self.config["SUMMARIZER"]["MAXTOKENS"],
32
+ temperature = self.config["SUMMARIZER"]["TEMPERATURE"]
33
+ )
34
+ response = completion["choices"][0]["message"]["content"]
35
+ logger.info("Summary generated successfully")
36
+ return response
37
+ except Exception as e:
38
+ logger.error(f"Error summarizing the text: {e}")
39
+ return None
src/pipelines/__init__.py ADDED
File without changes
src/pipelines/pipeline.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from ..components.extractPdfDetails import ExtractPdfDetails
2
+ from ..components.summaryEngine import SummaryEngine
3
+ from concurrent.futures import ThreadPoolExecutor
4
+ from ..utils.logger import logger
5
+
6
+ class Pipeline:
7
+ def __init__(self):
8
+ logger.info("INITIALIZING PIPELINE")
9
+ self.extractPdfDetails = ExtractPdfDetails()
10
+ self.summaryEngine = SummaryEngine()
11
+
12
+ def run(self, pdfBytes: bytes) -> str:
13
+ """
14
+ Run the pipeline
15
+ Args:
16
+ pdfBytes: bytes of the pdf file
17
+ Returns:
18
+ summary: summary of the pdf file
19
+ """
20
+ try:
21
+ logger.info("Running the pipeline")
22
+ images = self.extractPdfDetails.convertToImages(pdfBytes = pdfBytes)
23
+ chunks = self.extractPdfDetails.chunkImages(images = images)
24
+ with ThreadPoolExecutor(max_workers = 30) as executor:
25
+ futures = [executor.submit(self.extractPdfDetails.extractDetailsFromChunk, chunk) for chunk in chunks]
26
+ summaries = [future.result() for future in futures]
27
+ summary = self.summaryEngine.summarize(texts = summaries)
28
+ return summary
29
+ except Exception as e:
30
+ logger.error(f"Error running the pipeline: {e}")
31
+ return None
src/utils/__init__.py ADDED
File without changes
src/utils/functions.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from io import BytesIO
2
+ from PIL import Image
3
+ import configparser
4
+ import base64
5
+ import yaml
6
+
7
def getConfig(configFilePath: str) -> configparser.ConfigParser:
    """
    Load an ini-style configuration file.
    Args:
        configFilePath: path to the config file
    Returns:
        config: populated ConfigParser instance (empty if the file is missing)
    """
    parser = configparser.ConfigParser()
    parser.read(configFilePath)
    return parser
18
+
19
def convertImageToBase64(image: Image.Image) -> str:
    """
    Encode a PIL image as a base64 JPEG string.
    Args:
        image: image object
    Returns:
        imageString: base64 string of the image (JPEG, quality 85)
    """
    jpegBuffer = BytesIO()
    image.save(jpegBuffer, format = "JPEG", optimize = True, quality = 85)
    return base64.b64encode(jpegBuffer.getvalue()).decode("utf-8")
31
+
32
def getYaml(yamlFilePath: str) -> dict:
    """
    Parse a yaml file into a Python object.
    Args:
        yamlFilePath: path to the yaml file
    Returns:
        yamlData: parsed yaml content
    """
    with open(yamlFilePath, "r") as yamlFile:
        return yaml.safe_load(yamlFile)
src/utils/logger.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
# Central loguru configuration, imported for its side effects: every module
# does `from ..utils.logger import logger` and shares these two sinks.
from loguru import logger
import sys
import os

# Log file lives under <cwd>/logs; loguru creates missing parent directories.
logFilePath = os.path.join(os.getcwd(), "logs", "runningLogs.log")
logFormat = "<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level: <8}</level> | <cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> - <level>{message}</level>"

# Drop loguru's default stderr sink before installing ours.
logger.remove()
logger.add(sys.stdout, colorize = True, format = logFormat)
# enqueue=True routes records through a queue, making the file sink safe to use
# from the pipeline's ThreadPoolExecutor workers.
# NOTE(review): mode="w" truncates the log file on every process start —
# confirm that discarding previous runs' logs is intentional.
logger.add(logFilePath, format = logFormat, enqueue = True, mode = "w")
uv.lock ADDED
The diff for this file is too large to render. See raw diff