chburhan64 committed on
Commit
62be563
·
verified ·
1 Parent(s): d0f2866

Upload visualization_module.py

Browse files
Files changed (1) hide show
  1. visualization_module.py +277 -0
visualization_module.py ADDED
@@ -0,0 +1,277 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import pandas as pd
3
+ import matplotlib.pyplot as plt
4
+ import plotly.express as px
5
+ import plotly.graph_objects as go
6
+ from plotly.subplots import make_subplots
7
+ import numpy as np
8
+ from langchain.chains import LLMChain
9
+ from langchain_core.prompts import ChatPromptTemplate
10
+ import pdfplumber
11
+ import io
12
+
13
def get_data_extraction_prompt():
    """Build the chat prompt used to pull numerical data out of a text.

    Returns:
        ChatPromptTemplate: Template with a single ``{text}`` placeholder
        for the passage to analyze.
    """
    template = """
    Analyze the following text and extract all numerical data, statistics, and measurements.
    Focus on:
    - Tables with numerical values
    - Statistical results (percentages, counts, means, etc.)
    - Experimental data and measurements
    - Survey results and responses
    - Performance metrics and comparisons

    Text to analyze:
    {text}

    Please extract the data in a structured format and identify what each number represents.
    """
    return ChatPromptTemplate.from_template(template)
29
+
30
def get_chart_analysis_prompt():
    """Build the chat prompt used to ask for insights about a chart.

    Returns:
        ChatPromptTemplate: Template with ``{data_summary}`` and
        ``{chart_type}`` placeholders.
    """
    template = """
    Analyze the following data visualization and provide insights:

    Data Summary: {data_summary}
    Chart Type: {chart_type}

    Please provide:
    1. Key trends and patterns visible in the data
    2. Statistical significance or notable findings
    3. Implications for the research
    4. Any surprising or important insights

    Keep the analysis concise but informative.
    """
    return ChatPromptTemplate.from_template(template)
46
+
47
def extract_numerical_data_from_pdf(uploaded_files):
    """
    Extract page text and tables from PDF files using pdfplumber

    Args:
        uploaded_files: Iterable of binary file-like objects (each must
            support ``seek`` and ``read``) containing PDF data

    Returns:
        dict: Extracted data with three keys:
            'tables': list of DataFrames, one per usable table, collected
                across all files
            'numerical_text': always an empty list here (kept so the result
                shape stays stable for callers)
            'raw_text': text of every page of every file, each page
                followed by a newline
    """
    extracted_data = {
        'tables': [],
        'numerical_text': [],
        'raw_text': ''
    }
    # Collect page texts from ALL files; assigning per file would keep only
    # the last file's text while tables accumulate across every file.
    text_parts = []

    for file in uploaded_files:
        # Reset file pointer so the whole file is read
        file.seek(0)

        with pdfplumber.open(io.BytesIO(file.read())) as pdf:
            for page in pdf.pages:
                # Extract text
                text = page.extract_text()
                if text:
                    text_parts.append(text + "\n")

                # Extract tables
                tables = page.extract_tables()
                if not tables:
                    continue
                for table in tables:
                    # A usable table needs a header row plus at least one data row
                    if not table or len(table) < 2:
                        continue
                    try:
                        df = pd.DataFrame(table[1:], columns=table[0])
                    except Exception:
                        # Best-effort: skip tables pandas cannot build (for
                        # instance ragged rows) instead of failing the whole
                        # extraction. Unlike a bare ``except`` this does not
                        # swallow KeyboardInterrupt/SystemExit.
                        continue
                    extracted_data['tables'].append(df)

    extracted_data['raw_text'] = "".join(text_parts)

    return extracted_data
91
+
92
def extract_numbers_with_regex(text):
    """
    Extract numerical patterns from text using regex

    Args:
        text: Text to analyze; ``None`` or an empty string yields ``[]``

    Returns:
        list: Matched substrings, grouped by pattern in the order the
        patterns are listed below (not in order of appearance in the text).
        The same piece of text may be reported by more than one pattern.
    """
    if not text:
        return []

    patterns = [
        r'\b\d+\.?\d*%',  # Percentages
        r'\b\d+\.?\d*\s*(?:participants|subjects|samples|cases)\b',  # Sample sizes
        # Leading \b keeps the "p" from matching inside a word ("step = 5").
        r'\bp\s*[<>=]\s*\d+\.?\d*',  # P-values
        r'\b\d+\.?\d*\s*±\s*\d+\.?\d*',  # Mean ± SD
        r'\$\d+\.?\d*[MBK]?',  # Currency
        r'\b\d{4}\b',  # Years
        # Trailing \b forces a whole-word unit: without it "m" matched first
        # and truncated "5 minutes"/"12 mm" to "5 m"/"12 m", and "5 members"
        # was reported as a measurement.
        r'\b\d+\.?\d*\s*(?:kg|g|cm|mm|m|seconds?|minutes?|hours?|days?)\b',  # Units
    ]

    found_numbers = []
    for pattern in patterns:
        found_numbers.extend(re.findall(pattern, text, re.IGNORECASE))

    return found_numbers
118
+
119
def create_sample_charts(extracted_data):
    """
    Create sample charts from extracted data

    Sources are tried in order: the first extracted table (if it has at
    least one column with numeric-convertible values), then percentage
    values found in the raw text (at least two are required), and finally a
    placeholder figure stating that nothing could be plotted.

    Args:
        extracted_data: Dictionary containing extracted data; the keys
            'tables' (list of DataFrames) and 'raw_text' (str) are read

    Returns:
        tuple: (matplotlib figure, plotly figure, data summary)
    """
    # Try to create charts from tables first
    tables = extracted_data['tables']
    if tables:
        df = tables[0]  # Use first table
        numerical_cols = _numeric_columns(df)

        if numerical_cols:
            value_col = numerical_cols[0]
            values = pd.to_numeric(df[value_col], errors='coerce')

            # Plot the converted numbers, not the raw cells: table cells that
            # arrive as strings would otherwise be drawn as categories.
            plot_df = df.copy()
            plot_df[value_col] = values

            # Create bar chart with plotly
            fig_plotly = px.bar(
                plot_df,
                x=df.columns[0],
                y=value_col,
                title="Data from Research Paper",
                color_discrete_sequence=['#1f77b4']
            )

            # Create matplotlib chart
            fig_mpl, ax = plt.subplots(figsize=(10, 6))
            if len(df) <= 20:  # Only plot if reasonable number of rows
                ax.bar(range(len(df)), values.fillna(0))
                ax.set_title('Extracted Data Visualization')
                ax.set_xlabel('Data Points')
                ax.set_ylabel('Values')
            else:
                ax.text(0.5, 0.5, 'Too many data points to display',
                        transform=ax.transAxes, ha='center', va='center')
                ax.set_title('Data Available (Too Large to Display)')

            data_summary = f"Extracted table with {len(df)} rows and {len(df.columns)} columns. Numerical columns: {numerical_cols}"

            return fig_mpl, fig_plotly, data_summary

    # If no usable table, try to create chart from regex-extracted numbers
    numbers = extract_numbers_with_regex(extracted_data['raw_text'])

    # Keep just the percentages for a simple chart
    percentages = [
        float(re.search(r'\d+\.?\d*', num).group())
        for num in numbers
        if '%' in num
    ]

    if len(percentages) >= 2:
        shown = percentages[:10]
        positions = list(range(len(shown)))

        # Create simple bar chart
        fig_mpl, ax = plt.subplots(figsize=(10, 6))
        ax.bar(positions, shown)
        ax.set_title('Extracted Percentages from Text')
        ax.set_xlabel('Data Points')
        ax.set_ylabel('Percentage (%)')

        # Plotly version
        fig_plotly = px.bar(
            x=positions,
            y=shown,
            title="Extracted Percentages from Text",
            labels={'x': 'Data Points', 'y': 'Percentage (%)'}
        )

        # Report how many values are actually plotted (may be fewer than 10)
        data_summary = f"Extracted {len(percentages)} percentage values from text. Showing first {len(shown)}."

        return fig_mpl, fig_plotly, data_summary

    # Default case - create a simple info chart
    fig_mpl, ax = plt.subplots(figsize=(10, 6))
    ax.text(0.5, 0.5, 'No suitable numerical data found for visualization\nTry uploading a PDF with tables or statistical data',
            transform=ax.transAxes, ha='center', va='center', fontsize=12)
    ax.set_title('Data Extraction Result')
    ax.axis('off')

    fig_plotly = go.Figure()
    fig_plotly.add_annotation(
        text="No suitable numerical data found for visualization<br>Try uploading a PDF with tables or statistical data",
        xref="paper", yref="paper",
        x=0.5, y=0.5, xanchor='center', yanchor='middle',
        showarrow=False, font=dict(size=16)
    )
    fig_plotly.update_layout(title="Data Extraction Result")

    data_summary = "No suitable numerical data found in the document for visualization."

    return fig_mpl, fig_plotly, data_summary


def _numeric_columns(df):
    """Return labels of columns with at least one numeric-convertible value."""
    numeric = []
    for col in df.columns:
        try:
            # to_numeric returns a new object; the check must run on that
            # result, not on the untouched original column.
            converted = pd.to_numeric(df[col], errors='coerce')
        except (TypeError, ValueError):
            # Selection was not a plain 1-d column (for instance when header
            # labels are duplicated); treat it as non-numeric.
            continue
        if converted.notna().any():
            numeric.append(col)
    return numeric
216
+
217
def generate_visual_insights(llm, uploaded_files):
    """
    Generate visual insights from PDF data

    Extraction/charting and the AI analysis are guarded separately: if the
    language-model call fails, the charts that were already built are still
    returned and only 'ai_analysis' carries the failure message.

    Args:
        llm: Language model instance passed to LLMChain
        uploaded_files: List of uploaded PDF files

    Returns:
        dict: Contains charts and AI analysis under the keys
        'matplotlib_fig', 'plotly_fig', 'data_summary', 'ai_analysis',
        'extracted_numbers' and 'tables_found'. Exceptions raised while
        processing are not propagated; they are reported inside this dict.
    """
    try:
        # Extract data from PDF
        extracted_data = extract_numerical_data_from_pdf(uploaded_files)

        # Create charts
        fig_mpl, fig_plotly, data_summary = create_sample_charts(extracted_data)

        # Only the first 2000 characters of text are scanned for numbers
        extracted_numbers = extract_numbers_with_regex(extracted_data['raw_text'][:2000])
        tables_found = len(extracted_data['tables'])
    except Exception as e:
        # Top-level boundary: report the failure as charts instead of raising
        fig_mpl, fig_plotly = _error_figures(e)
        return {
            'matplotlib_fig': fig_mpl,
            'plotly_fig': fig_plotly,
            'data_summary': f"Error: {str(e)}",
            'ai_analysis': "Unable to analyze due to processing error.",
            'extracted_numbers': [],
            'tables_found': 0
        }

    # Generate AI analysis
    try:
        analysis_chain = LLMChain(llm=llm, prompt=get_chart_analysis_prompt())
        ai_analysis = analysis_chain.invoke({
            "data_summary": data_summary,
            "chart_type": "Bar Chart/Data Visualization"
        })
    except Exception as e:
        # A model failure must not discard the charts built above, nor be
        # reported as a PDF processing error.
        ai_analysis = f"Unable to generate AI analysis: {str(e)}"

    return {
        'matplotlib_fig': fig_mpl,
        'plotly_fig': fig_plotly,
        'data_summary': data_summary,
        'ai_analysis': ai_analysis,
        'extracted_numbers': extracted_numbers,
        'tables_found': tables_found
    }


def _error_figures(error):
    """Build the matplotlib and plotly placeholder figures describing *error*."""
    fig_mpl, ax = plt.subplots(figsize=(10, 6))
    ax.text(0.5, 0.5, f'Error processing PDF: {str(error)}\nPlease try with a different PDF file',
            transform=ax.transAxes, ha='center', va='center', fontsize=12)
    ax.set_title('Processing Error')
    ax.axis('off')

    fig_plotly = go.Figure()
    fig_plotly.add_annotation(
        text=f"Error processing PDF: {str(error)}<br>Please try with a different PDF file",
        xref="paper", yref="paper",
        x=0.5, y=0.5, xanchor='center', yanchor='middle',
        showarrow=False, font=dict(size=16)
    )
    fig_plotly.update_layout(title="Processing Error")

    return fig_mpl, fig_plotly