"""Gradio front end that extracts, cleans and formats an uploaded tax document.

The heavy lifting is delegated to the project helpers ``extract_text``,
``clean_text`` and ``format_text``; this module only wires them together,
writes the result to a ``.txt`` file and exposes the pipeline through a
``gr.Interface``.
"""

from pathlib import Path

import gradio as gr

from extractor import extract_text
from cleaner import clean_text
from formatter import format_text


def _output_path_for(file):
    """Return the ``processed_<stem>.txt`` path for an uploaded *file*.

    *file* may be an object exposing a ``.name`` path attribute (e.g. a
    temporary-file wrapper) or a plain path string; both forms are accepted
    so the function works regardless of how the File component hands the
    upload over.
    """
    source_name = getattr(file, "name", file)
    # Use the stem rather than chained str.replace so that only the final
    # extension is swapped (any case, any type) and directory separators are
    # handled by pathlib instead of a hard-coded "/".
    return Path(f"processed_{Path(source_name).stem}.txt")


def process_taxdoc(file):
    """Run the extract -> clean -> format pipeline on an uploaded document.

    Args:
        file: The upload as provided by the ``gr.File`` input. It is passed
            unchanged to ``extract_text``.
            NOTE(review): whether ``extract_text`` expects a file object or a
            path string is not visible here -- confirm against the extractor.

    Returns:
        A ``(formatted_text, output_path)`` tuple: the formatted text and the
        path (as ``str``) of the UTF-8 ``.txt`` file it was written to.
    """
    raw_text = extract_text(file)
    cleaned = clean_text(raw_text)
    formatted = format_text(cleaned)

    # The file is written relative to the current working directory.
    # NOTE(review): uploads sharing a base name overwrite each other's output.
    output_path = _output_path_for(file)
    output_path.write_text(formatted, encoding="utf-8")

    return formatted, str(output_path)


iface = gr.Interface(
    fn=process_taxdoc,
    # NOTE(review): the original output naming also special-cased ".docx",
    # but only .pdf/.txt uploads are allowed here -- confirm the intended set.
    inputs=gr.File(file_types=[".pdf", ".txt"]),
    outputs=[
        gr.Textbox(label="Formatted Text", lines=20),
        gr.File(label="Download File"),
    ],
    title="📄 TaxDoc Preprocessor for RAG",
    description="Upload Indian tax PDFs or TXT files to clean and structure them for better RAG retrieval.",
)


if __name__ == "__main__":
    iface.launch()