Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
#
|
| 2 |
|
| 3 |
import gradio as gr
|
| 4 |
import torch
|
|
@@ -680,11 +680,19 @@ def extract_text_from_docx(file_path: str) -> Tuple[str, list]:
|
|
| 680 |
if text:
|
| 681 |
paragraphs.append(text)
|
| 682 |
|
|
|
|
| 683 |
para_format = {
|
| 684 |
'alignment': para.alignment,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 685 |
'runs': []
|
| 686 |
}
|
| 687 |
|
|
|
|
| 688 |
for run in para.runs:
|
| 689 |
if run.text.strip():
|
| 690 |
run_format = {
|
|
@@ -693,8 +701,50 @@ def extract_text_from_docx(file_path: str) -> Tuple[str, list]:
|
|
| 693 |
'italic': run.italic,
|
| 694 |
'underline': run.underline,
|
| 695 |
'font_name': run.font.name,
|
| 696 |
-
'font_size': run.font.size
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 697 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 698 |
para_format['runs'].append(run_format)
|
| 699 |
|
| 700 |
formatting_info.append(para_format)
|
|
@@ -722,38 +772,110 @@ def create_formatted_docx(translated_paragraphs: list, formatting_info: list, fi
|
|
| 722 |
|
| 723 |
paragraph = doc.add_paragraph()
|
| 724 |
|
| 725 |
-
# Apply paragraph formatting
|
| 726 |
try:
|
| 727 |
if para_format.get('alignment') is not None:
|
| 728 |
paragraph.alignment = para_format['alignment']
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 729 |
except Exception as e:
|
| 730 |
logger.warning(f"Could not apply paragraph formatting: {e}")
|
| 731 |
|
| 732 |
-
# Apply run formatting
|
| 733 |
runs_info = para_format.get('runs', [])
|
| 734 |
|
| 735 |
if runs_info:
|
| 736 |
-
#
|
| 737 |
total_runs = len(runs_info)
|
|
|
|
|
|
|
| 738 |
bold_count = sum(1 for r in runs_info if r.get('bold'))
|
| 739 |
italic_count = sum(1 for r in runs_info if r.get('italic'))
|
| 740 |
underline_count = sum(1 for r in runs_info if r.get('underline'))
|
| 741 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 742 |
run = paragraph.add_run(para_text)
|
| 743 |
|
| 744 |
try:
|
|
|
|
| 745 |
if bold_count > total_runs / 2:
|
| 746 |
run.bold = True
|
| 747 |
if italic_count > total_runs / 2:
|
| 748 |
run.italic = True
|
| 749 |
if underline_count > total_runs / 2:
|
| 750 |
run.underline = True
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 751 |
except Exception as e:
|
| 752 |
-
logger.warning(f"Could not apply run formatting: {e}")
|
| 753 |
else:
|
|
|
|
| 754 |
paragraph.add_run(para_text)
|
| 755 |
|
| 756 |
doc.save(filename)
|
|
|
|
| 757 |
return filename
|
| 758 |
|
| 759 |
except Exception as e:
|
|
|
|
| 1 |
+
# Code v15
|
| 2 |
|
| 3 |
import gradio as gr
|
| 4 |
import torch
|
|
|
|
| 680 |
if text:
|
| 681 |
paragraphs.append(text)
|
| 682 |
|
| 683 |
+
# Store comprehensive paragraph formatting
|
| 684 |
para_format = {
|
| 685 |
'alignment': para.alignment,
|
| 686 |
+
'left_indent': para.paragraph_format.left_indent,
|
| 687 |
+
'right_indent': para.paragraph_format.right_indent,
|
| 688 |
+
'first_line_indent': para.paragraph_format.first_line_indent,
|
| 689 |
+
'space_before': para.paragraph_format.space_before,
|
| 690 |
+
'space_after': para.paragraph_format.space_after,
|
| 691 |
+
'line_spacing': para.paragraph_format.line_spacing,
|
| 692 |
'runs': []
|
| 693 |
}
|
| 694 |
|
| 695 |
+
# Store detailed run-level formatting
|
| 696 |
for run in para.runs:
|
| 697 |
if run.text.strip():
|
| 698 |
run_format = {
|
|
|
|
| 701 |
'italic': run.italic,
|
| 702 |
'underline': run.underline,
|
| 703 |
'font_name': run.font.name,
|
| 704 |
+
'font_size': run.font.size,
|
| 705 |
+
'font_color_rgb': None,
|
| 706 |
+
'font_color_theme': None,
|
| 707 |
+
'highlight_color': None,
|
| 708 |
+
'superscript': None,
|
| 709 |
+
'subscript': None,
|
| 710 |
+
'strike': None,
|
| 711 |
+
'double_strike': None,
|
| 712 |
+
'all_caps': None,
|
| 713 |
+
'small_caps': None
|
| 714 |
}
|
| 715 |
+
|
| 716 |
+
# Get font color (RGB)
|
| 717 |
+
try:
|
| 718 |
+
if run.font.color and run.font.color.rgb:
|
| 719 |
+
run_format['font_color_rgb'] = run.font.color.rgb
|
| 720 |
+
except:
|
| 721 |
+
pass
|
| 722 |
+
|
| 723 |
+
# Get font color (theme color)
|
| 724 |
+
try:
|
| 725 |
+
if run.font.color and run.font.color.theme_color:
|
| 726 |
+
run_format['font_color_theme'] = run.font.color.theme_color
|
| 727 |
+
except:
|
| 728 |
+
pass
|
| 729 |
+
|
| 730 |
+
# Get highlight color
|
| 731 |
+
try:
|
| 732 |
+
if run.font.highlight_color:
|
| 733 |
+
run_format['highlight_color'] = run.font.highlight_color
|
| 734 |
+
except:
|
| 735 |
+
pass
|
| 736 |
+
|
| 737 |
+
# Get additional formatting
|
| 738 |
+
try:
|
| 739 |
+
run_format['superscript'] = run.font.superscript
|
| 740 |
+
run_format['subscript'] = run.font.subscript
|
| 741 |
+
run_format['strike'] = run.font.strike
|
| 742 |
+
run_format['double_strike'] = run.font.double_strike
|
| 743 |
+
run_format['all_caps'] = run.font.all_caps
|
| 744 |
+
run_format['small_caps'] = run.font.small_caps
|
| 745 |
+
except:
|
| 746 |
+
pass
|
| 747 |
+
|
| 748 |
para_format['runs'].append(run_format)
|
| 749 |
|
| 750 |
formatting_info.append(para_format)
|
|
|
|
| 772 |
|
| 773 |
paragraph = doc.add_paragraph()
|
| 774 |
|
| 775 |
+
# Apply paragraph-level formatting
|
| 776 |
try:
|
| 777 |
if para_format.get('alignment') is not None:
|
| 778 |
paragraph.alignment = para_format['alignment']
|
| 779 |
+
if para_format.get('left_indent') is not None:
|
| 780 |
+
paragraph.paragraph_format.left_indent = para_format['left_indent']
|
| 781 |
+
if para_format.get('right_indent') is not None:
|
| 782 |
+
paragraph.paragraph_format.right_indent = para_format['right_indent']
|
| 783 |
+
if para_format.get('first_line_indent') is not None:
|
| 784 |
+
paragraph.paragraph_format.first_line_indent = para_format['first_line_indent']
|
| 785 |
+
if para_format.get('space_before') is not None:
|
| 786 |
+
paragraph.paragraph_format.space_before = para_format['space_before']
|
| 787 |
+
if para_format.get('space_after') is not None:
|
| 788 |
+
paragraph.paragraph_format.space_after = para_format['space_after']
|
| 789 |
+
if para_format.get('line_spacing') is not None:
|
| 790 |
+
paragraph.paragraph_format.line_spacing = para_format['line_spacing']
|
| 791 |
except Exception as e:
|
| 792 |
logger.warning(f"Could not apply paragraph formatting: {e}")
|
| 793 |
|
| 794 |
+
# Apply run-level formatting with full preservation
|
| 795 |
runs_info = para_format.get('runs', [])
|
| 796 |
|
| 797 |
if runs_info:
|
| 798 |
+
# Analyze the dominant formatting for the paragraph
|
| 799 |
total_runs = len(runs_info)
|
| 800 |
+
|
| 801 |
+
# Count formatting occurrences
|
| 802 |
bold_count = sum(1 for r in runs_info if r.get('bold'))
|
| 803 |
italic_count = sum(1 for r in runs_info if r.get('italic'))
|
| 804 |
underline_count = sum(1 for r in runs_info if r.get('underline'))
|
| 805 |
|
| 806 |
+
# Get most common formatting values
|
| 807 |
+
font_names = [r.get('font_name') for r in runs_info if r.get('font_name')]
|
| 808 |
+
font_sizes = [r.get('font_size') for r in runs_info if r.get('font_size')]
|
| 809 |
+
font_colors_rgb = [r.get('font_color_rgb') for r in runs_info if r.get('font_color_rgb')]
|
| 810 |
+
font_colors_theme = [r.get('font_color_theme') for r in runs_info if r.get('font_color_theme')]
|
| 811 |
+
highlight_colors = [r.get('highlight_color') for r in runs_info if r.get('highlight_color')]
|
| 812 |
+
|
| 813 |
+
# Create run with translated text
|
| 814 |
run = paragraph.add_run(para_text)
|
| 815 |
|
| 816 |
try:
|
| 817 |
+
# Apply basic formatting (use majority rule)
|
| 818 |
if bold_count > total_runs / 2:
|
| 819 |
run.bold = True
|
| 820 |
if italic_count > total_runs / 2:
|
| 821 |
run.italic = True
|
| 822 |
if underline_count > total_runs / 2:
|
| 823 |
run.underline = True
|
| 824 |
+
|
| 825 |
+
# Apply font name (most common)
|
| 826 |
+
if font_names:
|
| 827 |
+
most_common_font = max(set(font_names), key=font_names.count)
|
| 828 |
+
run.font.name = most_common_font
|
| 829 |
+
|
| 830 |
+
# Apply font size (most common)
|
| 831 |
+
if font_sizes:
|
| 832 |
+
most_common_size = max(set(font_sizes), key=font_sizes.count)
|
| 833 |
+
run.font.size = most_common_size
|
| 834 |
+
|
| 835 |
+
# Apply font color (RGB - most common)
|
| 836 |
+
if font_colors_rgb:
|
| 837 |
+
most_common_color = max(set(font_colors_rgb), key=font_colors_rgb.count)
|
| 838 |
+
run.font.color.rgb = most_common_color
|
| 839 |
+
|
| 840 |
+
# Apply font color (theme - most common)
|
| 841 |
+
elif font_colors_theme:
|
| 842 |
+
most_common_theme = max(set(font_colors_theme), key=font_colors_theme.count)
|
| 843 |
+
run.font.color.theme_color = most_common_theme
|
| 844 |
+
|
| 845 |
+
# Apply highlight color (most common)
|
| 846 |
+
if highlight_colors:
|
| 847 |
+
most_common_highlight = max(set(highlight_colors), key=highlight_colors.count)
|
| 848 |
+
run.font.highlight_color = most_common_highlight
|
| 849 |
+
|
| 850 |
+
# Apply additional formatting if majority of runs have it
|
| 851 |
+
superscript_count = sum(1 for r in runs_info if r.get('superscript'))
|
| 852 |
+
subscript_count = sum(1 for r in runs_info if r.get('subscript'))
|
| 853 |
+
strike_count = sum(1 for r in runs_info if r.get('strike'))
|
| 854 |
+
double_strike_count = sum(1 for r in runs_info if r.get('double_strike'))
|
| 855 |
+
all_caps_count = sum(1 for r in runs_info if r.get('all_caps'))
|
| 856 |
+
small_caps_count = sum(1 for r in runs_info if r.get('small_caps'))
|
| 857 |
+
|
| 858 |
+
if superscript_count > total_runs / 2:
|
| 859 |
+
run.font.superscript = True
|
| 860 |
+
if subscript_count > total_runs / 2:
|
| 861 |
+
run.font.subscript = True
|
| 862 |
+
if strike_count > total_runs / 2:
|
| 863 |
+
run.font.strike = True
|
| 864 |
+
if double_strike_count > total_runs / 2:
|
| 865 |
+
run.font.double_strike = True
|
| 866 |
+
if all_caps_count > total_runs / 2:
|
| 867 |
+
run.font.all_caps = True
|
| 868 |
+
if small_caps_count > total_runs / 2:
|
| 869 |
+
run.font.small_caps = True
|
| 870 |
+
|
| 871 |
except Exception as e:
|
| 872 |
+
logger.warning(f"Could not apply some run formatting: {e}")
|
| 873 |
else:
|
| 874 |
+
# No run formatting info, just add the text
|
| 875 |
paragraph.add_run(para_text)
|
| 876 |
|
| 877 |
doc.save(filename)
|
| 878 |
+
logger.info(f"Created formatted DOCX with full formatting preservation: {filename}")
|
| 879 |
return filename
|
| 880 |
|
| 881 |
except Exception as e:
|