# Spaces: Sleeping
from pathlib import Path

import gradio as gr

from extractor import extract_text
from cleaner import clean_text
from formatter import format_text
def process_taxdoc(file):
    """Extract, clean and format an uploaded document, then save the result.

    The upload is passed through ``extract_text``, ``clean_text`` and
    ``format_text`` in that order. The formatted text is written as UTF-8 to
    ``processed_<stem>.txt`` in the current working directory, where
    ``<stem>`` is the upload's file name without its final extension.

    Args:
        file: Uploaded-file object exposing the source path as ``file.name``.

    Returns:
        A ``(formatted_text, output_path)`` tuple: the formatted text and the
        path of the text file it was written to.

    Raises:
        ValueError: If no file was supplied.
    """
    if file is None:
        # Guard against a missing upload (Gradio hands the callback None in
        # that case) so the user sees a readable message rather than an
        # AttributeError on ``file.name``.
        raise ValueError("No file uploaded. Please upload a PDF or TXT file.")

    raw_text = extract_text(file)
    cleaned = clean_text(raw_text)
    formatted = format_text(cleaned)

    # Replace only the final extension. Chained str.replace() would also
    # rewrite ".pdf"/".docx" in the middle of a name and would miss
    # upper-case extensions such as ".PDF", leaving text in a ".PDF" file.
    output_path = f"processed_{Path(file.name).stem}.txt"
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(formatted)
    return formatted, output_path
# Gradio UI: a single file upload goes in; the formatted text and a
# downloadable copy of it come out (the two values process_taxdoc returns).
iface = gr.Interface(
    fn=process_taxdoc,
    inputs=gr.File(file_types=[".pdf", ".txt"]),
    outputs=[
        gr.Textbox(label="Formatted Text", lines=20),
        gr.File(label="Download File"),
    ],
    # Plain-text title: a mis-encoded leading glyph renders as garbage.
    title="TaxDoc Preprocessor for RAG",
    description="Upload Indian tax PDFs or TXT files to clean and structure them for better RAG retrieval.",
)

if __name__ == "__main__":
    # Launch the app only when executed as a script, not when imported.
    iface.launch()