Gopikanth123 committed on
Commit
7eaea00
·
verified ·
1 Parent(s): da708c5

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +165 -0
app.py ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import fitz # PyMuPDF
2
+ import os
3
+ import pandas as pd
4
+ import pdfplumber
5
+ import gradio as gr
6
+ import time
7
+ from pathlib import Path
8
+ import shutil
9
+
10
+ # Function to extract content from PDF
11
def extract_pdf_content(file_path):
    """Extract the text, tables, and embedded images from a PDF.

    Text and images are read with PyMuPDF (``fitz``); tables are detected
    with pdfplumber. Extracted images are written into a ``temp_images``
    directory that is deleted and recreated on every call, so image paths
    returned by an earlier call are no longer valid afterwards.

    Args:
        file_path: Path of the PDF to read. A falsy value (``None`` or
            ``""``) is treated as "no file" and yields empty results.

    Returns:
        A ``(text, tables, image_paths)`` tuple where ``text`` is the text
        of all pages joined by newlines (each page prefixed with a
        ``--- Page N ---`` header), ``tables`` is a list of
        ``pandas.DataFrame`` objects (one per detected table, in page
        order), and ``image_paths`` is the list of image files written.
    """
    # Nothing to extract; return before touching the filesystem.
    if not file_path:
        return "", [], []

    pdf_file = fitz.open(file_path)
    try:
        # Start from an empty images directory so files from a previous
        # extraction are not mixed with this one.
        images_dir = "temp_images"
        if os.path.exists(images_dir):
            shutil.rmtree(images_dir)
        os.makedirs(images_dir)

        all_text = []
        all_tables = []
        images_list = []

        # Open the pdfplumber document once for the whole loop instead of
        # reopening the file for every page.
        with pdfplumber.open(file_path) as plumber_pdf:
            plumber_pages = plumber_pdf.pages
            for page_num in range(len(pdf_file)):
                page_content = pdf_file[page_num]

                text = page_content.get_text("text")
                all_text.append(f"--- Page {page_num + 1} ---\n{text}")

                all_tables.extend(
                    pd.DataFrame(table)
                    for table in plumber_pages[page_num].extract_tables()
                )

                images_list.extend(page_content.get_images(full=True))

        # Write every referenced image to disk, numbered in encounter order.
        image_paths = []
        for i, image in enumerate(images_list, start=1):
            xref = image[0]  # first field of each get_images() entry
            base_image = pdf_file.extract_image(xref)
            image_name = f"{images_dir}/image_{i}.{base_image['ext']}"
            with open(image_name, "wb") as image_file:
                image_file.write(base_image["image"])
            image_paths.append(image_name)
    finally:
        # Close the PyMuPDF document even if extraction fails part-way.
        pdf_file.close()

    return "\n".join(all_text), all_tables, image_paths
63
+
64
+ # Gradio Interface
65
def display_pdf_content(file_path, progress=gr.Progress()):
    """Gradio callback: extract a PDF and format the results for display.

    Args:
        file_path: Path of the uploaded PDF as delivered by the file input.
            A falsy value (no file selected) produces empty outputs.
        progress: Gradio progress tracker. It is declared as a default
            argument because that is how Gradio injects it into callbacks.

    Returns:
        A ``(text, table_html, images)`` tuple: the extracted text, one HTML
        fragment containing every table under a ``Table N`` heading, and
        the list of extracted image file paths.
    """
    # No file selected: clear all outputs without running extraction.
    if not file_path:
        return "", "", []

    # Report only stages that correspond to real work. Extraction happens
    # in a single call, so there are no artificial delays or intermediate
    # stages announced after the work has already finished.
    progress(0, desc="Starting extraction...")
    text, tables, images = extract_pdf_content(file_path)
    progress(1.0, desc="Extraction complete!")

    # Render each table as styled HTML under its own heading.
    table_html = "".join(
        f"<h3>Table {idx}</h3>"
        + table.to_html(
            index=False,
            border=1,
            classes="table table-striped table-bordered",
        )
        for idx, table in enumerate(tables, start=1)
    )

    return text, table_html, images
85
+
86
# Stylesheet passed to gr.Blocks(css=...). It defines the page container
# width, the "table", "table-striped" and "table-bordered" classes used for
# the rendered tables, gallery image styling, and the "scrollable" and
# "center" helper classes attached to components via elem_classes.
custom_css = """
.gradio-container {
    max-width: 1200px;
    margin: auto;
}
.table {
    width: 100%;
    margin-bottom: 1rem;
    color: #212529;
}
.table-striped tbody tr:nth-of-type(odd) {
    background-color: rgba(0, 0, 0, 0.05);
}
.table-bordered {
    border: 1px solid #dee2e6;
}
.table-bordered th,
.table-bordered td {
    border: 1px solid #dee2e6;
}
.gallery {
    display: flex;
    flex-wrap: wrap;
    gap: 10px;
}
.gallery img {
    max-width: 100%;
    height: auto;
    border-radius: 5px;
    box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
}
.scrollable {
    max-height: 400px;
    overflow-y: auto;
    border: 1px solid #ddd;
    padding: 10px;
    border-radius: 5px;
}
.row {
    display: flex;
    gap: 20px;
    margin-bottom: 20px;
}
.column {
    flex: 1;
}
.center {
    text-align: center;
    margin: auto;
    width: 80%;
}
"""
139
+
140
# Create Gradio Interface
# Layout, top to bottom: a title, a row with the PDF upload control, a row
# with the extracted text next to the image gallery, and a row with the
# extracted tables.
with gr.Blocks(css=custom_css) as demo:
    gr.Markdown("# Advanced PDF Content Extractor")
    # Upload control; the file picker is restricted to .pdf files.
    with gr.Row():
        pdf_input = gr.File(label="Upload PDF File", file_types=[".pdf"])
    # Text and images are shown side by side in two columns.
    with gr.Row():
        with gr.Column():
            gr.Markdown("### Extracted Text")
            text_output = gr.Textbox(label="Text", lines=15, interactive=False, elem_classes="scrollable")
        with gr.Column():
            gr.Markdown("### Extracted Images")
            image_gallery = gr.Gallery(label="Images", columns=4, height="auto", elem_classes="scrollable")
    # Tables are rendered as raw HTML built by display_pdf_content.
    with gr.Row():
        with gr.Column():
            gr.Markdown("### Extracted Tables")
            table_output = gr.HTML(label="Tables", elem_classes="scrollable center")

    # Main function call
    # Runs on the file input's change event; the order of `outputs` matches
    # the order of the tuple returned by display_pdf_content.
    pdf_input.change(
        fn=display_pdf_content,
        inputs=pdf_input,
        outputs=[text_output, table_output, image_gallery]
    )
163
+
164
# Launch the Gradio app only when this file is executed as a script, so that
# importing the module (for tests or reuse of the extraction functions) does
# not start a web server as a side effect.
if __name__ == "__main__":
    demo.launch()