iamnew123's picture
Upload 5 files
7780d69 verified
raw
history blame contribute delete
870 Bytes
from pathlib import Path

import gradio as gr

from cleaner import clean_text
from extractor import extract_text
from formatter import format_text
def process_taxdoc(file):
    """Extract, clean and format an uploaded document, then save the result.

    The upload is run through ``extract_text`` -> ``clean_text`` ->
    ``format_text``; the formatted text is written as UTF-8 to
    ``processed_<stem>.txt`` in the current working directory.

    Args:
        file: The upload received from the ``gr.File`` input. It is passed
            unchanged to ``extract_text``. For naming the output, either an
            object exposing the path as ``.name`` or a plain path string is
            accepted.

    Returns:
        A ``(formatted_text, output_path)`` tuple. When ``file`` is ``None``
        a short message and ``None`` (no download) are returned instead of
        raising.
    """
    if file is None:
        # Nothing was uploaded: report it rather than fail on `file.name`.
        return "No file uploaded.", None

    formatted = format_text(clean_text(extract_text(file)))

    # Fall back to the value itself when the upload is a bare path string.
    source_name = getattr(file, "name", None) or str(file)
    # Use the stem so only the real extension is swapped, regardless of its
    # case, and so directory parts are dropped with the platform's own rules
    # (the old split("/") + replace() chain did neither).
    output_path = f"processed_{Path(source_name).stem}.txt"

    with open(output_path, "w", encoding="utf-8") as out:
        out.write(formatted)

    return formatted, output_path
# Gradio UI: one file upload in; the formatted text plus a downloadable file
# out (matching the two values returned by process_taxdoc).
iface = gr.Interface(
    fn=process_taxdoc,
    inputs=gr.File(file_types=[".pdf", ".txt"]),
    outputs=[
        gr.Textbox(label="Formatted Text", lines=20),
        gr.File(label="Download File"),
    ],
    # The title previously held a mis-decoded byte sequence; this is the
    # intended "page facing up" emoji (U+1F4C4).
    title="📄 TaxDoc Preprocessor for RAG",
    description="Upload Indian tax PDFs or TXT files to clean and structure them for better RAG retrieval.",
)

# Start the web server only when executed as a script, not when imported.
if __name__ == "__main__":
    iface.launch()