Spaces:
Sleeping
Sleeping
pdf_download_csv_output
Browse files
app.py
CHANGED
|
@@ -347,39 +347,37 @@ def convert_pdf_to_images(pdf_path):
|
|
| 347 |
|
| 348 |
def process_pdf_to_data(password, pdf_file):
|
| 349 |
if password != PASSWORD:
|
| 350 |
-
raise gr.Error("
|
| 351 |
|
| 352 |
processed_data = []
|
| 353 |
question_count = 0
|
| 354 |
|
| 355 |
pdf_image_paths = convert_pdf_to_images(pdf_file.name)
|
| 356 |
-
for pdf_image_path in pdf_image_paths
|
| 357 |
-
|
| 358 |
-
image_url = upload_image_to_gcs(pdf_image_path, bucket)
|
| 359 |
-
text = pdf_image_to_text(image_url)
|
| 360 |
|
| 361 |
-
|
| 362 |
-
|
| 363 |
-
|
| 364 |
|
| 365 |
-
|
| 366 |
-
|
| 367 |
-
|
| 368 |
-
|
| 369 |
-
|
| 370 |
-
|
| 371 |
|
| 372 |
-
|
| 373 |
-
|
| 374 |
-
|
| 375 |
-
|
| 376 |
|
| 377 |
result = f"PDF ่็ๅฎๆ๏ผ็ธฝๅ
ฑๅฎๆ {question_count} ้้ก็ฎ"
|
| 378 |
csv_file_path = create_csv(processed_data)
|
| 379 |
|
| 380 |
return processed_data, result, csv_file_path
|
| 381 |
|
| 382 |
-
def pdf_image_to_text(
|
| 383 |
user_prompt = """
|
| 384 |
่ซ่งฃ่ฎ้ก็ฎๅ็:
|
| 385 |
- ๅ็่ซไธๅฎ่ฆ็จ zh-TW ่งฃ่ฎ
|
|
@@ -405,25 +403,31 @@ def pdf_image_to_text(image_url):
|
|
| 405 |
]
|
| 406 |
"""
|
| 407 |
|
| 408 |
-
|
| 409 |
-
model="gpt-4o",
|
| 410 |
-
messages=[
|
| 411 |
{
|
| 412 |
"role": "user",
|
| 413 |
"content": [
|
| 414 |
{
|
| 415 |
"type": "text",
|
| 416 |
"text": user_prompt
|
| 417 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 418 |
{
|
| 419 |
"type": "image_url",
|
| 420 |
"image_url": {
|
| 421 |
"url": image_url,
|
| 422 |
},
|
| 423 |
-
}
|
| 424 |
-
|
| 425 |
-
|
| 426 |
-
|
|
|
|
|
|
|
| 427 |
max_tokens=4000,
|
| 428 |
)
|
| 429 |
return response.choices[0].message.content
|
|
@@ -442,23 +446,68 @@ def safe_json_loads(json_string):
|
|
| 442 |
print(f"Second JSONDecodeError: {e2}")
|
| 443 |
raise e2
|
| 444 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 445 |
|
| 446 |
# Gradio interface
|
| 447 |
with gr.Blocks() as demo:
|
| 448 |
with gr.Row():
|
| 449 |
-
password_input = gr.Textbox(label="
|
| 450 |
|
| 451 |
with gr.Tab("ๆน้่็"):
|
| 452 |
with gr.Row():
|
| 453 |
-
gr.Markdown("##
|
| 454 |
image_input = gr.Files(label="้ธๆๅ็", type="filepath")
|
| 455 |
-
# sheet_default_value = "https://docs.google.com/spreadsheets/d/1ygFGLxcnPad3LMVj4bZqfGZh1n2wqhs0-vOUjuVCkSY/edit#gid=0"
|
| 456 |
-
# sheet_input = gr.Textbox(label="Google Sheets URL", value=sheet_default_value)
|
| 457 |
submit_button = gr.Button("้ๅง๏ฟฝ๏ฟฝ็ๅ็")
|
| 458 |
with gr.Row():
|
| 459 |
result_text = gr.Textbox(label="่็็ตๆ")
|
| 460 |
-
with gr.Row():
|
| 461 |
download_csv_output = gr.File(label="ไธ่ฝฝ CSV")
|
|
|
|
|
|
|
| 462 |
with gr.Accordion(open=False):
|
| 463 |
with gr.Row():
|
| 464 |
result_table = gr.Dataframe(
|
|
@@ -469,16 +518,15 @@ with gr.Blocks() as demo:
|
|
| 469 |
|
| 470 |
with gr.Tab("ๅฎๅผต่็"):
|
| 471 |
with gr.Row():
|
| 472 |
-
gr.Markdown("##
|
| 473 |
-
single_image_input =
|
| 474 |
single_submit_button = gr.Button("้ๅง่็ๅ็")
|
| 475 |
with gr.Row():
|
| 476 |
single_result_text = gr.Textbox(label="่็็ตๆ")
|
|
|
|
| 477 |
with gr.Row():
|
| 478 |
single_question_image = gr.Image()
|
| 479 |
-
single_question_markdown = gr.Markdown(show_label=False, latex_delimiters=[{"left": "$", "right": "$", "display": False}])
|
| 480 |
-
with gr.Row():
|
| 481 |
-
single_download_csv_output = gr.File(label="ไธ่ฝฝ CSV")
|
| 482 |
with gr.Accordion(open=False):
|
| 483 |
with gr.Row():
|
| 484 |
single_result_table = gr.Dataframe(
|
|
@@ -494,8 +542,9 @@ with gr.Blocks() as demo:
|
|
| 494 |
pdf_submit_button = gr.Button("้ๅง่็ PDF")
|
| 495 |
with gr.Row():
|
| 496 |
pdf_result_text = gr.Textbox(label="่็็ตๆ")
|
| 497 |
-
with gr.Row():
|
| 498 |
pdf_download_csv_output = gr.File(label="ไธ่ฝฝ CSV")
|
|
|
|
|
|
|
| 499 |
with gr.Accordion(open=False):
|
| 500 |
with gr.Row():
|
| 501 |
pdf_result_table = gr.Dataframe(
|
|
@@ -503,13 +552,15 @@ with gr.Blocks() as demo:
|
|
| 503 |
column_widths=[10, 10, 5, 20, 4, 4, 4, 4, 4,4,4,4,4,4, 10],
|
| 504 |
wrap=True
|
| 505 |
)
|
| 506 |
-
# pdf_result_table = gr.Textbox(label="่็็ตๆ")
|
| 507 |
-
|
| 508 |
|
| 509 |
submit_button.click(
|
| 510 |
fn=process_image_to_data,
|
| 511 |
inputs=[password_input, image_input],
|
| 512 |
outputs=[result_table, result_text, download_csv_output]
|
|
|
|
|
|
|
|
|
|
|
|
|
| 513 |
)
|
| 514 |
|
| 515 |
single_submit_button.click(
|
|
@@ -530,6 +581,10 @@ with gr.Blocks() as demo:
|
|
| 530 |
fn=process_pdf_to_data,
|
| 531 |
inputs=[password_input, pdf_input],
|
| 532 |
outputs=[pdf_result_table, pdf_result_text, pdf_download_csv_output]
|
|
|
|
|
|
|
|
|
|
|
|
|
| 533 |
)
|
| 534 |
|
| 535 |
demo.launch()
|
|
|
|
| 347 |
|
| 348 |
def process_pdf_to_data(password, pdf_file):
    """Turn an uploaded PDF of questions into table rows, a status line and a CSV.

    Every page image produced by ``convert_pdf_to_images`` is uploaded with
    ``upload_image_to_gcs`` and all resulting URLs are passed to
    ``pdf_image_to_text`` in one call. The reply is parsed with
    ``safe_json_loads`` and each parsed item becomes one output row.

    Args:
        password: Value entered in the password box; compared with ``PASSWORD``.
        pdf_file: Uploaded file object; only its ``name`` attribute is read and
            handed to ``convert_pdf_to_images``.

    Returns:
        A ``(processed_data, result, csv_file_path)`` tuple: the list of rows,
        the status message, and the value returned by ``create_csv`` for
        those rows.

    Raises:
        gr.Error: If ``password`` does not equal ``PASSWORD``.
    """
    if password != PASSWORD:
        raise gr.Error("密碼错误，请重新输入")

    processed_data = []

    pdf_image_paths = convert_pdf_to_images(pdf_file.name)
    image_urls = [upload_image_to_gcs(path, bucket) for path in pdf_image_paths]
    text = pdf_image_to_text(image_urls)

    print("======pdf_image_to_text=====")
    print(text)
    print("========================")

    # Strip Markdown code-fence markers so the remainder can be parsed as JSON.
    text = text.replace("```json", "").replace("```", "")
    text_json = safe_json_loads(text)
    for text_item in text_json:
        print("======text_to_json=====")
        print(text_item)
        print("========================")

        question_json = safe_json_loads(text_to_json(text_item))
        perseus_json_str = build_perseus_json(question_json)
        # NOTE(review): the second column receives the whole reply text, not
        # just this item's text - confirm that is intended.
        processed_data.append(
            ["", text] + list(question_json.values()) + [perseus_json_str]
        )

    # One row is appended per parsed item, so the row count is the question count.
    question_count = len(processed_data)

    # NOTE(review): the character before "題目" was unreadable in the
    # mis-encoded source; the measure word "道" is assumed - confirm.
    result = f"PDF 處理完成，總共完成 {question_count} 道題目"
    csv_file_path = create_csv(processed_data)

    return processed_data, result, csv_file_path
|
| 379 |
|
| 380 |
+
def pdf_image_to_text(image_urls):
|
| 381 |
user_prompt = """
|
| 382 |
่ซ่งฃ่ฎ้ก็ฎๅ็:
|
| 383 |
- ๅ็่ซไธๅฎ่ฆ็จ zh-TW ่งฃ่ฎ
|
|
|
|
| 403 |
]
|
| 404 |
"""
|
| 405 |
|
| 406 |
+
messages=[
|
|
|
|
|
|
|
| 407 |
{
|
| 408 |
"role": "user",
|
| 409 |
"content": [
|
| 410 |
{
|
| 411 |
"type": "text",
|
| 412 |
"text": user_prompt
|
| 413 |
+
}
|
| 414 |
+
],
|
| 415 |
+
}
|
| 416 |
+
]
|
| 417 |
+
|
| 418 |
+
for image_url in image_urls:
|
| 419 |
+
messages[0]["content"].append(
|
| 420 |
{
|
| 421 |
"type": "image_url",
|
| 422 |
"image_url": {
|
| 423 |
"url": image_url,
|
| 424 |
},
|
| 425 |
+
}
|
| 426 |
+
)
|
| 427 |
+
|
| 428 |
+
response = OPEN_AI_CLIENT.chat.completions.create(
|
| 429 |
+
model="gpt-4o",
|
| 430 |
+
messages=messages,
|
| 431 |
max_tokens=4000,
|
| 432 |
)
|
| 433 |
return response.choices[0].message.content
|
|
|
|
| 446 |
print(f"Second JSONDecodeError: {e2}")
|
| 447 |
raise e2
|
| 448 |
|
| 449 |
+
def show_multiple_questions_markdown(data):
    """Render every question row of ``data`` as one Markdown document.

    Args:
        data: pandas DataFrame with one question per row. Columns read are
            ``題目``, ``選項1`` to ``選項4`` and ``答案`` (required), plus
            ``提示1`` to ``提示5`` (optional).

    Returns:
        A Markdown string with one section per row, each followed by that
        row's numbered hints, or ``""`` when ``data`` has no rows.

    Raises:
        KeyError: If a required column is missing from ``data``.
    """
    # len() rather than truthiness: bool() of a DataFrame raises ValueError.
    if len(data) == 0:
        return ""

    sections = []
    for _, row in data.iterrows():
        question_json = row.to_dict()

        question = question_json["題目"]
        choice_1 = question_json["選項1"]
        choice_2 = question_json["選項2"]
        choice_3 = question_json["選項3"]
        choice_4 = question_json["選項4"]
        answer = question_json["答案"]

        # Collect hints in order and stop at the first missing or empty one.
        hints = []
        for hint_no in range(1, 6):
            hint = question_json.get(f"提示{hint_no}")
            if not hint:
                break
            hints.append(hint)

        # Lines are emitted flush-left so leading whitespace cannot turn the
        # headings into an indented Markdown code block.
        section_lines = [
            "",
            "",
            "---",
            "",
            "## 題目",
            f"- {question}",
            "",
            "## 選項",
            f"1. {choice_1}",
            f"2. {choice_2}",
            f"3. {choice_3}",
            f"4. {choice_4}",
            "",
            f"## 答案: {answer}",
            "",
            "## 提示",
            "",
            "",
        ]
        sections.append("\n".join(section_lines))

        # Hints belong to the current question, so they follow its section.
        sections.extend(
            f"{position}. {hint}\n" for position, hint in enumerate(hints, start=1)
        )

    return "".join(sections)
|
| 495 |
|
| 496 |
# Gradio interface
|
| 497 |
with gr.Blocks() as demo:
|
| 498 |
with gr.Row():
|
| 499 |
+
password_input = gr.Textbox(label="ๅฏ็ขผ", type="password")
|
| 500 |
|
| 501 |
with gr.Tab("ๆน้่็"):
|
| 502 |
with gr.Row():
|
| 503 |
+
gr.Markdown("## ๆน้ๅ็่็ + Perseus JSON ็ๆ")
|
| 504 |
image_input = gr.Files(label="้ธๆๅ็", type="filepath")
|
|
|
|
|
|
|
| 505 |
submit_button = gr.Button("้ๅง๏ฟฝ๏ฟฝ็ๅ็")
|
| 506 |
with gr.Row():
|
| 507 |
result_text = gr.Textbox(label="่็็ตๆ")
|
|
|
|
| 508 |
download_csv_output = gr.File(label="ไธ่ฝฝ CSV")
|
| 509 |
+
with gr.Row():
|
| 510 |
+
batch_question_markdown = gr.Markdown(show_label=False, latex_delimiters=[{"left": "$", "right": "$", "display": False}])
|
| 511 |
with gr.Accordion(open=False):
|
| 512 |
with gr.Row():
|
| 513 |
result_table = gr.Dataframe(
|
|
|
|
| 518 |
|
| 519 |
with gr.Tab("ๅฎๅผต่็"):
|
| 520 |
with gr.Row():
|
| 521 |
+
gr.Markdown("## ๅฎๅผตๅ็่็")
|
| 522 |
+
single_image_input = gr.Files(label="้ธๆๅ็", type="filepath")
|
| 523 |
single_submit_button = gr.Button("้ๅง่็ๅ็")
|
| 524 |
with gr.Row():
|
| 525 |
single_result_text = gr.Textbox(label="่็็ตๆ")
|
| 526 |
+
single_download_csv_output = gr.File(label="ไธ่ฝฝ CSV")
|
| 527 |
with gr.Row():
|
| 528 |
single_question_image = gr.Image()
|
| 529 |
+
single_question_markdown = gr.Markdown(show_label=False, latex_delimiters=[{"left": "$", "right": "$", "display": False}])
|
|
|
|
|
|
|
| 530 |
with gr.Accordion(open=False):
|
| 531 |
with gr.Row():
|
| 532 |
single_result_table = gr.Dataframe(
|
|
|
|
| 542 |
pdf_submit_button = gr.Button("้ๅง่็ PDF")
|
| 543 |
with gr.Row():
|
| 544 |
pdf_result_text = gr.Textbox(label="่็็ตๆ")
|
|
|
|
| 545 |
pdf_download_csv_output = gr.File(label="ไธ่ฝฝ CSV")
|
| 546 |
+
with gr.Row():
|
| 547 |
+
pdf_question_markdown = gr.Markdown(show_label=False, latex_delimiters=[{"left": "$", "right": "$", "display": False}])
|
| 548 |
with gr.Accordion(open=False):
|
| 549 |
with gr.Row():
|
| 550 |
pdf_result_table = gr.Dataframe(
|
|
|
|
| 552 |
column_widths=[10, 10, 5, 20, 4, 4, 4, 4, 4,4,4,4,4,4, 10],
|
| 553 |
wrap=True
|
| 554 |
)
|
|
|
|
|
|
|
| 555 |
|
| 556 |
submit_button.click(
|
| 557 |
fn=process_image_to_data,
|
| 558 |
inputs=[password_input, image_input],
|
| 559 |
outputs=[result_table, result_text, download_csv_output]
|
| 560 |
+
).then(
|
| 561 |
+
fn=show_multiple_questions_markdown,
|
| 562 |
+
inputs=[result_table],
|
| 563 |
+
outputs=[batch_question_markdown]
|
| 564 |
)
|
| 565 |
|
| 566 |
single_submit_button.click(
|
|
|
|
| 581 |
fn=process_pdf_to_data,
|
| 582 |
inputs=[password_input, pdf_input],
|
| 583 |
outputs=[pdf_result_table, pdf_result_text, pdf_download_csv_output]
|
| 584 |
+
).then(
|
| 585 |
+
fn=show_multiple_questions_markdown,
|
| 586 |
+
inputs=[pdf_result_table],
|
| 587 |
+
outputs=[pdf_question_markdown]
|
| 588 |
)
|
| 589 |
|
| 590 |
demo.launch()
|