jb100 committed on
Commit
5e4d3c1
·
verified ·
1 Parent(s): 7f47db7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +128 -6
app.py CHANGED
@@ -1,4 +1,4 @@
1
- # code v13
2
 
3
  import gradio as gr
4
  import torch
@@ -680,11 +680,19 @@ def extract_text_from_docx(file_path: str) -> Tuple[str, list]:
680
  if text:
681
  paragraphs.append(text)
682
 
 
683
  para_format = {
684
  'alignment': para.alignment,
 
 
 
 
 
 
685
  'runs': []
686
  }
687
 
 
688
  for run in para.runs:
689
  if run.text.strip():
690
  run_format = {
@@ -693,8 +701,50 @@ def extract_text_from_docx(file_path: str) -> Tuple[str, list]:
693
  'italic': run.italic,
694
  'underline': run.underline,
695
  'font_name': run.font.name,
696
- 'font_size': run.font.size
 
 
 
 
 
 
 
 
 
697
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
698
  para_format['runs'].append(run_format)
699
 
700
  formatting_info.append(para_format)
@@ -722,38 +772,110 @@ def create_formatted_docx(translated_paragraphs: list, formatting_info: list, fi
722
 
723
  paragraph = doc.add_paragraph()
724
 
725
- # Apply paragraph formatting
726
  try:
727
  if para_format.get('alignment') is not None:
728
  paragraph.alignment = para_format['alignment']
 
 
 
 
 
 
 
 
 
 
 
 
729
  except Exception as e:
730
  logger.warning(f"Could not apply paragraph formatting: {e}")
731
 
732
- # Apply run formatting
733
  runs_info = para_format.get('runs', [])
734
 
735
  if runs_info:
736
- # Get dominant formatting
737
  total_runs = len(runs_info)
 
 
738
  bold_count = sum(1 for r in runs_info if r.get('bold'))
739
  italic_count = sum(1 for r in runs_info if r.get('italic'))
740
  underline_count = sum(1 for r in runs_info if r.get('underline'))
741
 
 
 
 
 
 
 
 
 
742
  run = paragraph.add_run(para_text)
743
 
744
  try:
 
745
  if bold_count > total_runs / 2:
746
  run.bold = True
747
  if italic_count > total_runs / 2:
748
  run.italic = True
749
  if underline_count > total_runs / 2:
750
  run.underline = True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
751
  except Exception as e:
752
- logger.warning(f"Could not apply run formatting: {e}")
753
  else:
 
754
  paragraph.add_run(para_text)
755
 
756
  doc.save(filename)
 
757
  return filename
758
 
759
  except Exception as e:
 
1
+ # Code v15
2
 
3
  import gradio as gr
4
  import torch
 
680
  if text:
681
  paragraphs.append(text)
682
 
683
+ # Store comprehensive paragraph formatting
684
  para_format = {
685
  'alignment': para.alignment,
686
+ 'left_indent': para.paragraph_format.left_indent,
687
+ 'right_indent': para.paragraph_format.right_indent,
688
+ 'first_line_indent': para.paragraph_format.first_line_indent,
689
+ 'space_before': para.paragraph_format.space_before,
690
+ 'space_after': para.paragraph_format.space_after,
691
+ 'line_spacing': para.paragraph_format.line_spacing,
692
  'runs': []
693
  }
694
 
695
+ # Store detailed run-level formatting
696
  for run in para.runs:
697
  if run.text.strip():
698
  run_format = {
 
701
  'italic': run.italic,
702
  'underline': run.underline,
703
  'font_name': run.font.name,
704
+ 'font_size': run.font.size,
705
+ 'font_color_rgb': None,
706
+ 'font_color_theme': None,
707
+ 'highlight_color': None,
708
+ 'superscript': None,
709
+ 'subscript': None,
710
+ 'strike': None,
711
+ 'double_strike': None,
712
+ 'all_caps': None,
713
+ 'small_caps': None
714
  }
715
+
716
+ # Get font color (RGB)
717
+ try:
718
+ if run.font.color and run.font.color.rgb:
719
+ run_format['font_color_rgb'] = run.font.color.rgb
720
+ except:
721
+ pass
722
+
723
+ # Get font color (theme color)
724
+ try:
725
+ if run.font.color and run.font.color.theme_color:
726
+ run_format['font_color_theme'] = run.font.color.theme_color
727
+ except:
728
+ pass
729
+
730
+ # Get highlight color
731
+ try:
732
+ if run.font.highlight_color:
733
+ run_format['highlight_color'] = run.font.highlight_color
734
+ except:
735
+ pass
736
+
737
+ # Get additional formatting
738
+ try:
739
+ run_format['superscript'] = run.font.superscript
740
+ run_format['subscript'] = run.font.subscript
741
+ run_format['strike'] = run.font.strike
742
+ run_format['double_strike'] = run.font.double_strike
743
+ run_format['all_caps'] = run.font.all_caps
744
+ run_format['small_caps'] = run.font.small_caps
745
+ except:
746
+ pass
747
+
748
  para_format['runs'].append(run_format)
749
 
750
  formatting_info.append(para_format)
 
772
 
773
  paragraph = doc.add_paragraph()
774
 
775
+ # Apply paragraph-level formatting
776
  try:
777
  if para_format.get('alignment') is not None:
778
  paragraph.alignment = para_format['alignment']
779
+ if para_format.get('left_indent') is not None:
780
+ paragraph.paragraph_format.left_indent = para_format['left_indent']
781
+ if para_format.get('right_indent') is not None:
782
+ paragraph.paragraph_format.right_indent = para_format['right_indent']
783
+ if para_format.get('first_line_indent') is not None:
784
+ paragraph.paragraph_format.first_line_indent = para_format['first_line_indent']
785
+ if para_format.get('space_before') is not None:
786
+ paragraph.paragraph_format.space_before = para_format['space_before']
787
+ if para_format.get('space_after') is not None:
788
+ paragraph.paragraph_format.space_after = para_format['space_after']
789
+ if para_format.get('line_spacing') is not None:
790
+ paragraph.paragraph_format.line_spacing = para_format['line_spacing']
791
  except Exception as e:
792
  logger.warning(f"Could not apply paragraph formatting: {e}")
793
 
794
+ # Apply run-level formatting with full preservation
795
  runs_info = para_format.get('runs', [])
796
 
797
  if runs_info:
798
+ # Analyze the dominant formatting for the paragraph
799
  total_runs = len(runs_info)
800
+
801
+ # Count formatting occurrences
802
  bold_count = sum(1 for r in runs_info if r.get('bold'))
803
  italic_count = sum(1 for r in runs_info if r.get('italic'))
804
  underline_count = sum(1 for r in runs_info if r.get('underline'))
805
 
806
+ # Get most common formatting values
807
+ font_names = [r.get('font_name') for r in runs_info if r.get('font_name')]
808
+ font_sizes = [r.get('font_size') for r in runs_info if r.get('font_size')]
809
+ font_colors_rgb = [r.get('font_color_rgb') for r in runs_info if r.get('font_color_rgb')]
810
+ font_colors_theme = [r.get('font_color_theme') for r in runs_info if r.get('font_color_theme')]
811
+ highlight_colors = [r.get('highlight_color') for r in runs_info if r.get('highlight_color')]
812
+
813
+ # Create run with translated text
814
  run = paragraph.add_run(para_text)
815
 
816
  try:
817
+ # Apply basic formatting (use majority rule)
818
  if bold_count > total_runs / 2:
819
  run.bold = True
820
  if italic_count > total_runs / 2:
821
  run.italic = True
822
  if underline_count > total_runs / 2:
823
  run.underline = True
824
+
825
+ # Apply font name (most common)
826
+ if font_names:
827
+ most_common_font = max(set(font_names), key=font_names.count)
828
+ run.font.name = most_common_font
829
+
830
+ # Apply font size (most common)
831
+ if font_sizes:
832
+ most_common_size = max(set(font_sizes), key=font_sizes.count)
833
+ run.font.size = most_common_size
834
+
835
+ # Apply font color (RGB - most common)
836
+ if font_colors_rgb:
837
+ most_common_color = max(set(font_colors_rgb), key=font_colors_rgb.count)
838
+ run.font.color.rgb = most_common_color
839
+
840
+ # Apply font color (theme - most common)
841
+ elif font_colors_theme:
842
+ most_common_theme = max(set(font_colors_theme), key=font_colors_theme.count)
843
+ run.font.color.theme_color = most_common_theme
844
+
845
+ # Apply highlight color (most common)
846
+ if highlight_colors:
847
+ most_common_highlight = max(set(highlight_colors), key=highlight_colors.count)
848
+ run.font.highlight_color = most_common_highlight
849
+
850
+ # Apply additional formatting if majority of runs have it
851
+ superscript_count = sum(1 for r in runs_info if r.get('superscript'))
852
+ subscript_count = sum(1 for r in runs_info if r.get('subscript'))
853
+ strike_count = sum(1 for r in runs_info if r.get('strike'))
854
+ double_strike_count = sum(1 for r in runs_info if r.get('double_strike'))
855
+ all_caps_count = sum(1 for r in runs_info if r.get('all_caps'))
856
+ small_caps_count = sum(1 for r in runs_info if r.get('small_caps'))
857
+
858
+ if superscript_count > total_runs / 2:
859
+ run.font.superscript = True
860
+ if subscript_count > total_runs / 2:
861
+ run.font.subscript = True
862
+ if strike_count > total_runs / 2:
863
+ run.font.strike = True
864
+ if double_strike_count > total_runs / 2:
865
+ run.font.double_strike = True
866
+ if all_caps_count > total_runs / 2:
867
+ run.font.all_caps = True
868
+ if small_caps_count > total_runs / 2:
869
+ run.font.small_caps = True
870
+
871
  except Exception as e:
872
+ logger.warning(f"Could not apply some run formatting: {e}")
873
  else:
874
+ # No run formatting info, just add the text
875
  paragraph.add_run(para_text)
876
 
877
  doc.save(filename)
878
+ logger.info(f"Created formatted DOCX with full formatting preservation: {filename}")
879
  return filename
880
 
881
  except Exception as e: