Spaces:
Running
Running
| # html2txt.py | |
| from bs4 import BeautifulSoup | |
| import sys | |
| import aiofiles | |
| from tqdm.asyncio import tqdm | |
async def convert_html_to_txt(html_file_path: str, output_txt_path: str) -> bool:
    """Extract the text of every ``<p>`` element from an HTML file into a TXT file.

    The HTML file is read as UTF-8 and parsed with BeautifulSoup using the
    ``lxml`` parser. The text of each ``<p>`` element is collected, with the
    element's text fragments stripped and joined by a single space; paragraphs
    that contain no text are dropped. The remaining lines are joined with
    newlines and written to the output file as UTF-8.

    Failures are never raised to the caller: each stage reports its error on
    stderr via ``tqdm.write`` and the function returns ``False``.

    Args:
        html_file_path: Path of the HTML file to read.
        output_txt_path: Path of the text file to create or overwrite.

    Returns:
        ``True`` if the text file was written, ``False`` if reading, parsing
        or writing failed.
    """
    # Stage 1: read the HTML source.
    try:
        async with aiofiles.open(html_file_path, 'r', encoding='utf-8') as f:
            html_from_file = await f.read()
    except FileNotFoundError:
        tqdm.write(f"[!] Error: Intermediate HTML file not found '{html_file_path}'.", file=sys.stderr)
        return False
    except Exception as e:
        tqdm.write(f"[!] Error reading HTML file: {e}", file=sys.stderr)
        return False

    # Stage 2: parse and extract. Guarded like the I/O stages so a parser
    # failure (for example bs4 raising FeatureNotFound when the requested
    # "lxml" parser is not installed) yields False instead of escaping.
    try:
        soup = BeautifulSoup(html_from_file, "lxml")
        # Render each paragraph once; an empty result means the paragraph
        # had no text and is skipped.
        paragraph_texts = (
            p.get_text(separator=" ", strip=True) for p in soup.find_all('p')
        )
        extracted_lines = [text for text in paragraph_texts if text]
    except Exception as e:
        tqdm.write(f"[!] Error parsing HTML content: {e}", file=sys.stderr)
        return False

    tqdm.write(f"[*] Text extraction complete, found {len(extracted_lines)} valid lines of text.")

    # Stage 3: write the result. Joining a list of str cannot fail, so only
    # the file I/O sits inside the try.
    full_text_content = "\n".join(extracted_lines)
    try:
        async with aiofiles.open(output_txt_path, 'w', encoding='utf-8') as f:
            await f.write(full_text_content)
    except Exception as e:
        tqdm.write(f"[!] Error writing to TXT file: {e}", file=sys.stderr)
        return False
    return True