Spaces:
Running
Running
| """ | |
| Convert source documents (PDF, HTML) to Markdown using MarkItDown. | |
| Saves converted files to sources/converted/. | |
| """ | |
| import os | |
| import sys | |
| from markitdown import MarkItDown | |
# Input and output locations, resolved relative to this script's directory.
RAW_DIR = os.path.join(os.path.dirname(__file__), "..", "sources", "raw")
CONVERTED_DIR = os.path.join(os.path.dirname(__file__), "..", "sources", "converted")

os.makedirs(CONVERTED_DIR, exist_ok=True)

# Exit with a readable message (status 1) instead of an os.listdir traceback
# when the input directory does not exist.
if not os.path.isdir(RAW_DIR):
    sys.exit(f"Raw source directory not found: {RAW_DIR}")

md = MarkItDown()

# List all files to convert. Only regular files are kept, so the count printed
# below matches the number of conversions that are actually attempted.
files = sorted(
    name
    for name in os.listdir(RAW_DIR)
    if os.path.isfile(os.path.join(RAW_DIR, name))
)
print(f"Found {len(files)} files in {RAW_DIR}\n")

# Output names already claimed during this run. Two inputs that share a stem
# (e.g. report.pdf and report.html) would otherwise both map to report.md and
# the later one would silently overwrite the earlier one.
used_names = set()
converted = 0
failed = 0

for filename in files:
    filepath = os.path.join(RAW_DIR, filename)

    base_name = os.path.splitext(filename)[0]
    output_name = f"{base_name}.md"
    if output_name in used_names:
        # Keep the original extension in the name to disambiguate,
        # e.g. report.pdf -> report.pdf.md.
        output_name = f"{filename}.md"
    used_names.add(output_name)
    output_path = os.path.join(CONVERTED_DIR, output_name)

    print(f"Converting: {filename}")
    try:
        result = md.convert(filepath)
        text = result.text_content
        # Write to file
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(text)
    except Exception as e:
        # Deliberately broad: this is a best-effort batch run, so one bad
        # document is reported and the remaining files are still processed.
        print(f" FAILED: {e}\n")
        failed += 1
    else:
        # Stats
        lines = text.count("\n") + 1
        size_kb = len(text.encode("utf-8")) / 1024
        print(f" -> {output_path}")
        print(f" {lines} lines, {size_kb:.1f} KB\n")
        converted += 1

print(f"Done: {converted} converted, {failed} failed")