CurioChen committed on
Commit
97d42a7
·
verified ·
1 Parent(s): 97e7c1f

Delete app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -247
app.py DELETED
@@ -1,247 +0,0 @@
1
- import base64
2
- import json
3
- import requests
4
- import datetime
5
- import hashlib
6
- import hmac
7
- import logging
8
- import ntplib
9
- import time
10
- import os
11
- import tempfile
12
- import io
13
- from openai import OpenAI
14
- from openpyxl import Workbook
15
- import gradio as gr
16
- import re
17
- import fitz # PyMuPDF
18
- import pandas as pd
19
- from gradio_pdf import PDF # Import the new PDF component
20
-
21
- # Configure logging
22
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')

# Get configuration from environment variables.
#
# Credentials intentionally have NO in-source fallback: a secret committed to
# the repository is public to anyone who can read the history and must be
# treated as compromised. When a variable is unset the value is None, so the
# deployment has to provide SECRET_ID / SECRET_KEY / OPENAI_API_KEY itself
# (e.g. as Space / container secrets).
SECRET_ID = os.getenv("SECRET_ID")
SECRET_KEY = os.getenv("SECRET_KEY")

# Non-secret settings keep their defaults.
REGION = os.getenv("REGION", "ap-guangzhou")
ENDPOINT = os.getenv("ENDPOINT", "lke.tencentcloudapi.com")
SERVICE = "lke"
ACTION = "ReconstructDocument"
VERSION = "2023-11-30"

# OpenAI API key (read from the environment only; never hard-code it here)
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
36
-
37
-
38
- # Get NTP time
39
def get_ntp_time():
    """Return the current time in UTC, preferring an NTP server over the local clock.

    Queries ``pool.ntp.org`` (NTP version 3, 5-second timeout). If the query
    or the timestamp conversion raises, a warning is logged and the local
    system clock is used instead.

    Returns:
        A timezone-aware ``datetime.datetime`` in UTC.
    """
    utc = datetime.timezone.utc
    client = ntplib.NTPClient()
    try:
        reply = client.request('pool.ntp.org', version=3, timeout=5)
        return datetime.datetime.fromtimestamp(reply.tx_time, utc)
    except Exception as e:
        # Best-effort lookup: any failure falls through to the local clock.
        logging.warning(f"Unable to get NTP time, using local time: {e}")
    return datetime.datetime.now(utc)
47
-
48
-
49
- # Signing function
50
def sign(key, msg):
    """Return the raw HMAC-SHA256 digest of ``msg`` under ``key``.

    Args:
        key: The HMAC key, as bytes.
        msg: The message, as text; it is UTF-8 encoded before hashing.

    Returns:
        The 32-byte digest as ``bytes`` (not hex-encoded).
    """
    encoded = msg.encode("utf-8")
    mac = hmac.new(key, encoded, digestmod=hashlib.sha256)
    return mac.digest()
52
-
53
-
54
- # Get authentication information
55
def get_auth(secret_id, secret_key, host, method, params, headers):
    """Build the request headers carrying a TC3-HMAC-SHA256 signature.

    Args:
        secret_id: Credential identifier placed in the ``Credential=`` field.
        secret_key: Secret used to derive the signing key.
        host: Host name; it is signed and echoed back in the ``Host`` header.
        method: HTTP method; upper-cased before signing.
        params: Request parameters; serialised with ``json.dumps`` and hashed
            as the request payload.
        headers: Mapping looked up with the exact lowercase key
            ``"content-type"``; when that key is absent
            ``application/x-www-form-urlencoded`` is used.

    Returns:
        A dict with ``Authorization``, ``Host``, ``Content-Type`` and the
        ``X-TC-*`` headers (timestamp, version, action, region).
    """
    algorithm = "TC3-HMAC-SHA256"
    signed_headers = "content-type;host"

    # A single clock reading feeds both the timestamp and the credential date.
    now = get_ntp_time()
    timestamp = int(now.timestamp())
    date = now.strftime('%Y-%m-%d')

    # Canonical request: method, URI "/", empty query string, the two signed
    # headers (that block already ends with a newline), the signed-header
    # list, and the payload hash.
    http_request_method = method.upper()
    ct = headers.get("content-type", "application/x-www-form-urlencoded")
    body_hash = hashlib.sha256(json.dumps(params).encode("utf-8")).hexdigest()
    canonical_request = "\n".join([
        http_request_method,
        "/",
        "",
        f"content-type:{ct}\nhost:{host}\n",
        signed_headers,
        body_hash,
    ])

    # String to sign.
    credential_scope = f"{date}/{SERVICE}/tc3_request"
    request_hash = hashlib.sha256(canonical_request.encode("utf-8")).hexdigest()
    string_to_sign = "\n".join([algorithm, str(timestamp), credential_scope, request_hash])

    # Derive the signing key by chaining HMACs over date, service and the
    # fixed terminator, then sign.
    signing_key = f"TC3{secret_key}".encode("utf-8")
    for scope_part in (date, SERVICE, "tc3_request"):
        signing_key = sign(signing_key, scope_part)
    signature = hmac.new(signing_key, string_to_sign.encode("utf-8"), hashlib.sha256).hexdigest()

    authorization = (
        f"{algorithm} Credential={secret_id}/{credential_scope}, "
        f"SignedHeaders={signed_headers}, Signature={signature}"
    )

    return {
        "Authorization": authorization,
        "Host": host,
        "Content-Type": ct,
        "X-TC-Timestamp": str(timestamp),
        "X-TC-Version": VERSION,
        "X-TC-Action": ACTION,
        "X-TC-Region": REGION,
    }
93
-
94
-
95
- # Extract information
96
def extract_information(content):
    """Ask the chat model to pull contract fields out of ``content`` as JSON.

    The request is attempted up to three times, sleeping five seconds between
    attempts. An attempt counts as failed when the API call or JSON parsing
    raises, or when the completion's finish reason is not ``"stop"``.

    Args:
        content: Document text appended after the extraction prompt.

    Returns:
        The model's JSON answer re-serialised with 4-space indentation and
        non-ASCII characters preserved, or ``None`` if all attempts fail.
    """
    client = OpenAI(api_key=OPENAI_API_KEY)

    prompt = (
        "There are some guides, respond in detailed content, respond without content in (), JSON begin with contracts value:\n"
        "1. Contract awarded date\n"
        "2. Construction location (This part of the content is in the title, not in the table; the address must be returned and should be detailed.)\n"
        "3. Tender reference\n"
        "4. Construction summary (in the 'particular' section)\n"
        "5. Contractor\n"
        "6. Contractor address(this is not company name, the address must be returned and should be detailed.)\n"
        "7. Amount\n"
        "8. Notice publish date (at the end of the content)"
    )
    messages = [
        {"role": "system", "content": "You are a helpful assistant designed to output JSON"},
        {"role": "user", "content": f"{prompt}\n\n{content}"},
    ]

    for attempt_no in range(1, 4):  # three attempts in total
        try:
            logging.info(f"Extracting information (Attempt {attempt_no}/3)")
            response = client.chat.completions.create(
                model="gpt-4o",
                messages=messages,
                response_format={"type": "json_object"},
            )

            if response.choices[0].finish_reason != "stop":
                logging.warning(f"Warning: Unexpected completion reason - {response.choices[0].finish_reason}")
            else:
                parsed = json.loads(response.choices[0].message.content)
                return json.dumps(parsed, ensure_ascii=False, indent=4)
        except Exception as e:
            logging.error(f"Error: API call failed - {str(e)}")

        # Pause before retrying, but not after the final attempt.
        if attempt_no < 3:
            time.sleep(5)

    return None  # every attempt failed
135
-
136
-
137
- # JSON to Excel
138
def json_to_excel(json_data):
    """Write the contracts in ``json_data`` to a temporary ``.xlsx`` file.

    Args:
        json_data: JSON text whose top-level object has a ``contracts`` entry
            that is iterated as a sequence of per-contract mappings.

    Returns:
        Path of the saved workbook. The temporary file is created with
        ``delete=False``, so it remains on disk after this function returns.

    Raises:
        KeyError: if the parsed JSON has no ``contracts`` key.
    """
    parsed = json.loads(json_data)

    workbook = Workbook()
    sheet = workbook.active

    headers = [
        'contract_awarded_date', 'construction_location', 'tender_reference',
        'construction_summary', 'contractor', 'contractor_address',
        'amount', 'notice_publish_date',
    ]
    sheet.append(headers)

    def normalize(text):
        # Compare on lowercase alphanumerics only, so "Contract Awarded Date"
        # and "contract_awarded_date" are treated as the same key.
        return ''.join(ch.lower() for ch in text if ch.isalnum())

    def lookup(contract, column):
        # First key (in the contract's own order) matching the column wins;
        # a missing field becomes an empty cell.
        wanted = normalize(column)
        for key, value in contract.items():
            if normalize(key) == wanted:
                return value
        return ''

    for contract in parsed['contracts']:
        sheet.append([lookup(contract, column) for column in headers])

    with tempfile.NamedTemporaryFile(suffix='.xlsx', delete=False) as handle:
        output_path = handle.name
        workbook.save(output_path)
    return output_path
166
-
167
def clean_url(input_text):
    """Return ``input_text`` without surrounding whitespace and double quotes.

    Whitespace is trimmed first, then any run of ``"`` characters at either
    end is removed. Single quotes and inner characters are left untouched.
    """
    trimmed = input_text.strip()
    return trimmed.strip('"')
171
-
172
- # New function: Process uploaded PDF
173
def process_pdf(file):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        file: Either an object with a ``name`` attribute (its ``name`` is
            opened) or a value passed to ``fitz.open`` directly, such as a
            file path.

    Returns:
        The concatenated text of all pages, in page order.

    Raises:
        Exception: any error from opening or reading the document is logged
            and re-raised unchanged.
    """
    logging.info(f"Start processing PDF file: {type(file)}")
    try:
        # File objects carry their path in ``.name``; anything else is
        # handed to PyMuPDF as-is.
        source = file.name if hasattr(file, 'name') else file
        with fitz.open(source) as doc:
            text_content = "".join(page.get_text() for page in doc)
        logging.info("PDF processing successful")
        return text_content
    except Exception as e:
        logging.error(f"PDF processing error: {str(e)}")
        raise
193
-
194
def preview_excel(excel_path):
    """Build a Gradio dataframe previewing the top-left corner of a workbook.

    Reads at most 10 data rows from ``excel_path`` and keeps the first 8
    columns. If reading fails, the error is logged and an empty
    ``gr.Dataframe`` is returned instead of raising.
    """
    try:
        sheet = pd.read_excel(excel_path, nrows=10)
        corner = sheet.head(10).iloc[:, :8]
        return gr.Dataframe(value=corner)
    except Exception as e:
        logging.error(f"Excel preview error: {str(e)}")
        return gr.Dataframe()
202
-
203
def process_pdf_file(file):
    """Gradio handler: turn an uploaded PDF into an Excel file plus a preview.

    Runs text extraction, information extraction, and Excel export in turn.
    Failures are reported through the status message rather than raised.

    Args:
        file: The uploaded PDF as delivered by the input component, or
            ``None`` when nothing was uploaded.

    Returns:
        A 3-tuple ``(status message, Excel path or None, gr.Dataframe)``.
    """
    if file is None:
        logging.warning("No file uploaded")
        return "Please upload a PDF file.", None, gr.Dataframe()

    # Stage 1: pull the raw text out of the PDF.
    try:
        file_label = file.name if hasattr(file, 'name') else 'No name'
        logging.info(f"Received file: {type(file)}, {file_label}")
        pdf_content = process_pdf(file)
    except Exception as e:
        logging.error(f"Error processing PDF file: {str(e)}", exc_info=True)
        return f"Error processing PDF file: {str(e)}", None, gr.Dataframe()

    # Stage 2: extract structured data and export it.
    try:
        json_data = extract_information(pdf_content)
        if json_data is not None:
            excel_path = json_to_excel(json_data)
            excel_preview = preview_excel(excel_path)

            logging.info("File processing successful")
            return "Processing successful!", excel_path, excel_preview

        logging.error("Failed to extract information")
        return "Error extracting information. Please try again later.", None, gr.Dataframe()
    except Exception as e:
        logging.error(f"Error processing file: {str(e)}", exc_info=True)
        return f"Error processing file: {str(e)}", None, gr.Dataframe()
229
-
230
- # Gradio interface
231
# One PDF upload in; status text, downloadable workbook and a table preview out.
iface = gr.Interface(
    title="PDF Document Processing and Information Extraction",
    description="Upload a PDF file, and the system will process it and generate an Excel result.",
    fn=process_pdf_file,
    inputs=[
        PDF(label="Upload PDF File"),  # the PDF component is given a label only
    ],
    outputs=[
        gr.Textbox(label="Processing Status"),
        gr.File(label="Download Excel File"),
        gr.Dataframe(label="Excel Preview (First 10 rows, 8 columns)"),
    ],
)

# Start the Gradio server only when executed as a script.
if __name__ == "__main__":
    iface.launch()