Spaces:

Pavan178
/

txt-pdf

Paused

App Files Files Community

Pavan178 committed on Jul 29, 2024

Commit

94cefaf

verified ·

1 Parent(s): 18621da

Create app.py

Browse files

Files changed (1) hide show

app.py +113 -0

app.py ADDED Viewed

	@@ -0,0 +1,113 @@

+import gradio as gr
+import pdfplumber
+import re
+import tempfile
+import os
+import torch
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+from concurrent.futures import ThreadPoolExecutor
+import spaces
+@spaces.GPU
def preprocess_text_for_tts(text):
    """Clean raw extracted text so it is safe to feed to later text stages.

    The steps run in this order: fold Unicode to plain ASCII, drop URLs,
    e-mail addresses and 10-digit phone numbers, collapse runs of dots,
    title-case ALL-CAPS words (a short list of abbreviations is kept as is),
    and normalise spacing around punctuation.

    Args:
        text: Raw text, for example the text of one PDF page. ``None`` and
            the empty string are accepted.

    Returns:
        A single-line string containing only printable ASCII characters;
        ``""`` when there is nothing to clean.
    """
    # NOTE(review): this function is registered with @spaces.GPU although it
    # only does regex work; the decorator presumably belongs on the model
    # inference path. Confirm before moving it, since the Space may need at
    # least one decorated function to start.
    import unicodedata

    if not text:
        return ""

    # Fold ligatures, accented letters and non-breaking spaces to their ASCII
    # equivalents first. Without this the ASCII filter below turns them into
    # spaces and breaks words apart (e.g. the "fi" ligature, curly apostrophes).
    decomposed = unicodedata.normalize("NFKD", text)
    text = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
    text = text.translate(str.maketrans({
        "\u2018": "'",
        "\u2019": "'",
        "\u201c": '"',
        "\u201d": '"',
        "\u2013": "-",
        "\u2014": "-",
        "\u00ad": None,  # soft hyphen: invisible, must not split the word
    }))

    # Anything still outside printable ASCII (including newlines) becomes a space.
    text = re.sub(r'[^\x20-\x7E]', ' ', text)

    # Require "://" or "www." so ordinary words such as "httpd" are kept.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'\S+@\S+', '', text)
    text = re.sub(r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b', '', text)

    # Dot leaders and ellipses carry no content.
    text = re.sub(r'\.{2,}', ' ', text)

    # Built once per call instead of once per regex match.
    common_abbreviations = {'AI', 'ML', 'NLP', 'CV', 'API', 'GPU', 'CPU', 'RAM', 'ROM', 'USA', 'UK', 'EU'}

    def convert_case(match):
        word = match.group(0)
        return word if word in common_abbreviations else word.title()

    text = re.sub(r'\b[A-Z]+\b', convert_case, text)
    text = re.sub(r'\s+', ' ', text)

    # Repair sentences that were glued together during extraction.
    text = re.sub(r'\.([A-Za-z])', r'. \1', text)
    text = re.sub(r'([a-z])([A-Z])', r'\1. \2', text)

    # No space before punctuation, exactly one after it.
    text = re.sub(r'([A-Za-z])\s([.,!?])', r'\1\2', text)
    text = re.sub(r'([.,!?])([A-Za-z])', r'\1 \2', text)
    return re.sub(r'\s+', ' ', text).strip()
# Run inference on the GPU when PyTorch reports one, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# The tokenizer and the seq2seq model are both loaded from this identifier,
# and the model is then moved onto the selected device.
model_name = "sherif31/T5-Grammer-Correction"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model = model.to(device)
def correct_text(text, max_chunk_length=512):
    """Run the grammar-correction model over *text* and return the result.

    The text is split into chunks of at most ``max_chunk_length`` characters
    so each model call stays small. Chunks end on whitespace, so a word is
    never cut in half; only a single word longer than the limit is split.
    Each chunk is prefixed with ``"grammar: "``, encoded (truncated to 512
    tokens), generated with beam search, and decoded. The corrected chunks
    are joined with single spaces.

    Args:
        text: Text to correct. Whitespace characters such as newlines are
            treated as plain spaces when chunking.
        max_chunk_length: Upper bound on characters per chunk.

    Returns:
        The corrected text, or ``""`` when *text* is empty or only whitespace.
    """
    import textwrap

    if not text:
        return ""

    # Slicing at fixed offsets would cut words in half and the halves would
    # then be re-joined with a space; wrap on whitespace instead.
    chunks = textwrap.wrap(
        text,
        width=max_chunk_length,
        break_long_words=True,
        break_on_hyphens=False,
    )

    corrected_chunks = []
    for chunk in chunks:
        input_text = f"grammar: {chunk}"
        input_ids = tokenizer.encode(
            input_text, return_tensors="pt", max_length=512, truncation=True
        ).to(device)
        # Inference only: skip gradient bookkeeping.
        with torch.no_grad():
            output = model.generate(
                input_ids, max_length=512, num_return_sequences=1, num_beams=5
            )
        corrected_chunks.append(tokenizer.decode(output[0], skip_special_tokens=True))
    return ' '.join(corrected_chunks)
def extract_text_from_pages(pdf_bytes):
    """Extract, clean and grammar-correct the text of every page in a PDF.

    Args:
        pdf_bytes: The raw bytes of the PDF document.

    Returns:
        A dict mapping 1-based page numbers to the processed page text.
        Pages with no extractable text map to ``""``.
    """
    import io

    page_text_dict = {}
    # Open the PDF from memory: no temporary file is written, so nothing can
    # be left behind on disk if processing fails part-way.
    with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:
        for page_num, page in enumerate(pdf.pages, 1):
            raw_text = page.extract_text()
            if raw_text:
                cleaned_text = preprocess_text_for_tts(raw_text)
                page_text_dict[page_num] = correct_text(cleaned_text)
            else:
                # Keep an entry for empty pages so page numbering stays complete.
                page_text_dict[page_num] = ""
    return page_text_dict
def process_pdf(pdf_file):
    """Turn an uploaded PDF into a page-by-page text report.

    Args:
        pdf_file: The uploaded PDF content as passed in by the interface,
            or ``None`` when nothing was uploaded.

    Returns:
        A string with one ``"Page N:"`` section per page, each followed by a
        blank line, or a short message when no file was provided.
    """
    if pdf_file is None:
        return "No file uploaded. Please upload a PDF file."

    # extract_text_from_pages already cleans and grammar-corrects each page,
    # so the text is used as returned. Running correct_text on it again would
    # double the inference cost and re-edit already corrected text.
    pages = extract_text_from_pages(pdf_file)
    return "".join(
        f"Page {page_num}:\n{text}\n\n" for page_num, text in pages.items()
    )
# Input and output widgets for the web UI: a binary file upload and a text box.
pdf_input = gr.File(label="Upload PDF", type="binary")
text_output = gr.Textbox(label="Extracted and Processed Text")

# Wire the widgets to process_pdf.
iface = gr.Interface(
    fn=process_pdf,
    inputs=pdf_input,
    outputs=text_output,
    title="PDF Text Extractor and Processor",
    description="Upload a PDF file to extract, clean, and correct its text content.",
)

# Start serving the app.
iface.launch()