CurioChen committed on
Commit
48926f3
·
verified ·
1 Parent(s): 2d6d56a

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +244 -0
  2. requirements.txt +7 -0
app.py ADDED
@@ -0,0 +1,244 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import json
3
+ import requests
4
+ import datetime
5
+ import hashlib
6
+ import hmac
7
+ import logging
8
+ import ntplib
9
+ import time
10
+ import os
11
+ import tempfile
12
+ import io
13
+ from openai import OpenAI
14
+ from openpyxl import Workbook
15
+ import gradio as gr
16
+ import re
17
+ import fitz # PyMuPDF
18
+ import pandas as pd
19
+
20
# Configure logging
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')

# Get configuration from environment variables.
# SECURITY: credentials are read from the environment only. They must never be
# hard-coded as fallback defaults: anything committed to a repository has to be
# treated as leaked. Keys that were previously committed in this file should be
# revoked and rotated.
SECRET_ID = os.getenv("SECRET_ID")
SECRET_KEY = os.getenv("SECRET_KEY")
REGION = os.getenv("REGION", "ap-guangzhou")
ENDPOINT = os.getenv("ENDPOINT", "lke.tencentcloudapi.com")
SERVICE = "lke"
ACTION = "ReconstructDocument"
VERSION = "2023-11-30"

# OpenAI API key (None when the environment variable is unset)
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Surface missing credentials at start-up instead of at the first API call.
for _name, _value in (
    ("SECRET_ID", SECRET_ID),
    ("SECRET_KEY", SECRET_KEY),
    ("OPENAI_API_KEY", OPENAI_API_KEY),
):
    if not _value:
        logging.warning(
            "Environment variable %s is not set; calls that need it are expected to fail",
            _name,
        )
35
+
36
+
37
+ # Get NTP time
38
def get_ntp_time():
    """Return the current time as a timezone-aware UTC datetime.

    Asks ``pool.ntp.org`` (NTP version 3, 5 second timeout) and converts the
    reply's transmit timestamp. If anything in that path raises, a warning is
    logged and the local clock is used instead.
    """
    utc = datetime.timezone.utc
    client = ntplib.NTPClient()
    try:
        tx_time = client.request('pool.ntp.org', version=3, timeout=5).tx_time
        now = datetime.datetime.fromtimestamp(tx_time, utc)
    except Exception as e:
        logging.warning(f"Unable to get NTP time, using local time: {e}")
        now = datetime.datetime.now(utc)
    return now
46
+
47
+
48
+ # Signing function
49
def sign(key, msg):
    """Return the raw HMAC-SHA256 digest of *msg* keyed with *key*.

    *key* is bytes; *msg* is a str and is UTF-8 encoded before hashing.
    """
    encoded_message = msg.encode("utf-8")
    return hmac.digest(key, encoded_message, hashlib.sha256)
51
+
52
+
53
+ # Get authentication information
54
def get_auth(secret_id, secret_key, host, method, params, headers):
    """Build the signed request headers for a TC3-HMAC-SHA256 API call.

    Args:
        secret_id: Credential id placed in the ``Authorization`` header.
        secret_key: Credential key used to derive the signing key.
        host: Request host; signed and returned as the ``Host`` header.
        method: HTTP method; upper-cased before signing.
        params: JSON-serialisable request parameters. The signature covers the
            SHA-256 of ``json.dumps(params)``, so the caller is expected to
            send exactly that serialisation as the request body.
        headers: Mapping of request headers (may be ``None``). Only the
            content type is read; the name is matched case-insensitively.

    Returns:
        dict with ``Authorization``, ``Host``, ``Content-Type`` and the
        ``X-TC-*`` headers. Timestamp comes from ``get_ntp_time()``; service,
        version, action and region come from the module-level constants.
    """
    algorithm = "TC3-HMAC-SHA256"
    ntp_time = get_ntp_time()
    timestamp = int(ntp_time.timestamp())
    date = ntp_time.strftime('%Y-%m-%d')

    # HTTP header names are case-insensitive. A plain headers.get("content-type")
    # would miss "Content-Type" and silently sign the default value instead.
    ct = "application/x-www-form-urlencoded"
    for header_name, header_value in (headers or {}).items():
        if header_name.lower() == "content-type":
            ct = header_value
            break

    # Step 1: canonical request.
    http_request_method = method.upper()
    canonical_uri = "/"
    canonical_querystring = ""
    payload = json.dumps(params)
    # The signature scheme expects header values lower-cased and trimmed in the
    # canonical form; the headers actually returned keep the caller's spelling.
    canonical_headers = f"content-type:{ct.strip().lower()}\nhost:{host.strip().lower()}\n"
    signed_headers = "content-type;host"
    hashed_request_payload = hashlib.sha256(payload.encode("utf-8")).hexdigest()
    canonical_request = (f"{http_request_method}\n{canonical_uri}\n{canonical_querystring}\n"
                         f"{canonical_headers}\n{signed_headers}\n{hashed_request_payload}")

    # Step 2: string to sign.
    credential_scope = f"{date}/{SERVICE}/tc3_request"
    hashed_canonical_request = hashlib.sha256(canonical_request.encode("utf-8")).hexdigest()
    string_to_sign = f"{algorithm}\n{timestamp}\n{credential_scope}\n{hashed_canonical_request}"

    # Step 3: derive the signing key (date -> service -> "tc3_request") and sign.
    secret_date = sign(f"TC3{secret_key}".encode("utf-8"), date)
    secret_service = sign(secret_date, SERVICE)
    secret_signing = sign(secret_service, "tc3_request")
    signature = hmac.new(secret_signing, string_to_sign.encode("utf-8"), hashlib.sha256).hexdigest()

    # Step 4: assemble the Authorization header.
    authorization = (f"{algorithm} Credential={secret_id}/{credential_scope}, "
                     f"SignedHeaders={signed_headers}, Signature={signature}")

    return {
        "Authorization": authorization,
        "Host": host,
        "Content-Type": ct,
        "X-TC-Timestamp": str(timestamp),
        "X-TC-Version": VERSION,
        "X-TC-Action": ACTION,
        "X-TC-Region": REGION,
    }
92
+
93
+
94
+ # Extract information
95
def extract_information(content, *, max_attempts=3, retry_delay=5):
    """Ask the chat model to extract contract fields from *content* as JSON.

    Args:
        content: Text of the notice to analyse.
        max_attempts: How many times the API call is tried (default 3).
        retry_delay: Seconds to sleep between attempts (default 5).

    Returns:
        The model's JSON answer re-serialised as an indented string, or
        ``None`` when *content* is empty or every attempt failed.
    """
    # Nothing to analyse (e.g. a PDF without a text layer): do not call the
    # model with only the prompt, which would invite made-up contract data.
    if not content or not content.strip():
        logging.warning("No text content to extract information from")
        return None

    client = OpenAI(api_key=OPENAI_API_KEY)

    prompt = (
        "There are some guides, respond in detailed content, respond without content in (), JSON begin with contracts value:\n"
        "1. Contract awarded date\n"
        "2. Construction location (This part of the content is in the title, not in the table; the address must be returned and should be detailed.)\n"
        "3. Tender reference\n"
        "4. Construction summary (in the 'particular' section)\n"
        "5. Contractor\n"
        "6. Contractor address(this is not company name, the address must be returned and should be detailed.)\n"
        "7. Amount\n"
        "8. Notice publish date (at the end of the content)"
    )

    for attempt in range(1, max_attempts + 1):
        try:
            logging.info(f"Extracting information (Attempt {attempt}/{max_attempts})")
            response = client.chat.completions.create(
                model="gpt-4o",
                messages=[
                    {"role": "system", "content": "You are a helpful assistant designed to output JSON"},
                    {"role": "user", "content": f"{prompt}\n\n{content}"}
                ],
                response_format={"type": "json_object"}
            )

            choice = response.choices[0]
            if choice.finish_reason == "stop":
                # Round-trip through json to validate the answer and normalise
                # its formatting; a parse error counts as a failed attempt.
                extracted_info = json.loads(choice.message.content)
                return json.dumps(extracted_info, ensure_ascii=False, indent=4)
            logging.warning(f"Warning: Unexpected completion reason - {choice.finish_reason}")
        except Exception as e:
            # Best-effort retry: log the failure and fall through to the next attempt.
            logging.error(f"Error: API call failed - {str(e)}")

        if attempt < max_attempts:  # No point in waiting after the final attempt
            time.sleep(retry_delay)

    return None  # Every attempt failed
134
+
135
+
136
+ # JSON to Excel
137
def json_to_excel(json_data):
    """Write the extracted contracts to a temporary .xlsx file and return its path.

    Args:
        json_data: JSON string whose top-level ``contracts`` entry is a list
            of contract objects (a single object is accepted as well).

    Returns:
        Path of the saved workbook. The file is created with ``delete=False``,
        so removing it is left to the caller.

    Raises:
        KeyError: if the JSON has no ``contracts`` entry.
    """
    def normalize(name):
        # Compare keys on lower-cased alphanumerics only, so that
        # "Contract awarded date" matches "contract_awarded_date".
        return ''.join(c.lower() for c in str(name) if c.isalnum())

    def to_cell(value):
        # A worksheet cell cannot hold dicts or lists; store them as JSON text.
        if value is None or isinstance(value, (str, int, float, bool)):
            return value
        return json.dumps(value, ensure_ascii=False)

    data = json.loads(json_data)
    contracts = data['contracts']
    if isinstance(contracts, dict):
        # A lone contract returned as an object rather than a one-item list.
        contracts = [contracts]

    wb = Workbook()
    ws = wb.active

    headers = ['contract_awarded_date', 'construction_location', 'tender_reference',
               'construction_summary', 'contractor', 'contractor_address',
               'amount', 'notice_publish_date']
    ws.append(headers)
    normalized_headers = [normalize(header) for header in headers]

    for contract in contracts:
        # Normalise each key once; setdefault keeps the first matching key.
        lookup = {}
        for key, value in contract.items():
            lookup.setdefault(normalize(key), value)
        ws.append([to_cell(lookup.get(header, '')) for header in normalized_headers])

    # Reserve a unique file name, then save after the handle is closed:
    # reopening a still-open temporary file by name is not portable.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.xlsx') as tmp:
        output_path = tmp.name
    wb.save(output_path)
    return output_path
165
+
166
def clean_url(input_text):
    """Return *input_text* without surrounding whitespace and double quotes.

    Whitespace is stripped first, then any leading/trailing ``"`` characters.
    """
    without_whitespace = input_text.strip()
    return without_whitespace.strip('"')
170
+
171
# Extract the text of an uploaded PDF file
def process_pdf(file):
    """Return the concatenated text of every page of a PDF.

    Args:
        file: Either an object exposing a ``name`` attribute (its value is
            used as the path) or the path itself.

    Returns:
        The text of all pages joined in page order.

    Raises:
        Exception: any error raised while opening or reading the document is
            logged and re-raised unchanged.
    """
    logging.info(f"开始处理PDF文件: {type(file)}")
    try:
        # Both input shapes end up as a path, so a single code path suffices.
        pdf_path = file.name if hasattr(file, 'name') else file
        with fitz.open(pdf_path) as doc:
            text_content = "".join(page.get_text() for page in doc)
        logging.info("PDF处理成功")
        return text_content
    except Exception as e:
        logging.error(f"PDF处理错误: {str(e)}")
        raise
192
+
193
def preview_excel(excel_path):
    """Return an HTML table previewing the top-left corner of a workbook.

    Reads at most three data rows from *excel_path* and renders the first
    three columns without the index. On any error the problem is logged and
    the string "Unable to generate preview" is returned instead.
    """
    try:
        frame = pd.read_excel(excel_path, nrows=3)
        top_left = frame.iloc[:3, :3]
        return top_left.to_html(index=False)
    except Exception as e:
        logging.error(f"Error previewing Excel: {str(e)}")
        return "Unable to generate preview"
201
+
202
def process_pdf_file(file):
    """Run the full pipeline for one uploaded PDF.

    Steps: read the PDF text, extract the contract fields, write them to an
    Excel file and render a preview.

    Returns:
        A ``(status_message, excel_path, preview_html)`` tuple. On any failure
        the path is ``None`` and the preview is an empty string.
    """
    def failed(message):
        # Shape shared by every unsuccessful outcome.
        return message, None, ""

    if file is None:
        logging.warning("No file uploaded")
        return failed("Please upload a PDF file.")

    # Stage 1: pull the text out of the PDF.
    try:
        file_label = file.name if hasattr(file, 'name') else 'No name'
        logging.info(f"Received file: {type(file)}, {file_label}")
        text = process_pdf(file)
    except Exception as e:
        logging.error(f"Error processing PDF file: {str(e)}", exc_info=True)
        return failed(f"Error processing PDF file: {str(e)}")

    # Stage 2: extract the fields and build the spreadsheet plus its preview.
    try:
        extracted = extract_information(text)
        if extracted is None:
            logging.error("Failed to extract information")
            return failed("Error extracting information. Please try again later.")

        workbook_path = json_to_excel(extracted)
        preview_html = preview_excel(workbook_path)

        logging.info("File processing successful")
        return "Processing successful!", workbook_path, preview_html
    except Exception as e:
        logging.error(f"Error processing file: {str(e)}", exc_info=True)
        return failed(f"Error processing file: {str(e)}")
228
+
229
# Gradio interface: one PDF upload in; status text, Excel download and HTML preview out
_pdf_input = gr.File(label="Upload PDF File", type="filepath", file_types=[".pdf"])
_result_outputs = [
    gr.Textbox(label="Processing Status"),
    gr.File(label="Download Excel File"),
    gr.HTML(label="Excel Preview"),
]

iface = gr.Interface(
    fn=process_pdf_file,
    inputs=_pdf_input,
    outputs=_result_outputs,
    title="PDF Document Processing and Information Extraction",
    description="Upload a PDF file, and the system will process it and generate an Excel result.",
)

# Start the Gradio application only when this file is run as a script
if __name__ == "__main__":
    iface.launch()
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ openai
2
+ openpyxl
3
+ gradio
4
+ PyMuPDF
5
+ pandas
6
+ requests
7
+ ntplib