Spaces:
Build error
Build error
| import os | |
| import re | |
| from pathlib import Path | |
| from bs4 import BeautifulSoup | |
def process_html_file(file_path, output_path):
    """Rewrite one HTML file with its "Statement:" heading trimmed.

    The file at ``file_path`` is parsed with BeautifulSoup's ``html.parser``.
    The first ``<h3>`` whose string starts with ``Statement:`` has the phrase
    ``in the table:`` (plus any whitespace before it and everything after it)
    removed. The resulting document is always written to ``output_path``,
    whether or not such a heading was found.

    Args:
        file_path: Path of the HTML file to read (UTF-8).
        output_path: Path the processed HTML is written to (UTF-8).
    """
    markup = Path(file_path).read_text(encoding='utf-8')
    soup = BeautifulSoup(markup, 'html.parser')

    # Locate the heading that carries the statement text.
    heading_pattern = re.compile(r'^Statement:')
    heading = soup.find('h3', string=heading_pattern)

    if heading:
        original_text = heading.string
        # DOTALL lets ".*" run across newlines, so the whole tail is dropped
        # even when the heading text spans several lines.
        tail_pattern = re.compile(r'\s*in the table:.*$', flags=re.DOTALL)
        trimmed_text = tail_pattern.sub('', original_text)
        heading.string.replace_with(trimmed_text)

    # Serialize the (possibly modified) tree to the destination.
    with open(output_path, 'w', encoding='utf-8') as sink:
        sink.write(str(soup))
def process_directory(input_dir, output_dir):
    """Process every ``.html`` file under ``input_dir`` into ``output_dir``.

    The input tree is walked recursively. Each file whose name ends with
    ``.html`` is passed to ``process_html_file`` and written to the same
    relative location beneath ``output_dir``; missing destination folders
    are created on demand. Files with other names are ignored.

    Args:
        input_dir: Root directory to scan for HTML files.
        output_dir: Root directory that receives the processed copies.
    """
    for current_dir, _subdirs, filenames in os.walk(input_dir):
        for name in filenames:
            if not name.endswith('.html'):
                continue

            source = Path(current_dir) / name
            # Rebuild the file's position relative to the input root so the
            # output tree mirrors the input tree.
            target = Path(output_dir) / source.relative_to(input_dir)
            target.parent.mkdir(parents=True, exist_ok=True)
            process_html_file(source, target)
# Source tree to read, and the separate tree that receives the processed copies.
input_directory, output_directory = "htmls_DATER_mod", "htmls_DATER_mod2"

# Run the conversion over the whole input tree, then report completion.
process_directory(input_directory, output_directory)
print("Processing complete. Modified files are in the output directory.")