Update app.py
app.py CHANGED
@@ -1,24 +1,36 @@
 import spaces
 import transformers
 import re
+from transformers import AutoConfig, AutoTokenizer, AutoModel, AutoModelForCausalLM, pipeline
+from vllm import LLM, SamplingParams
 import torch
 import gradio as gr
+import json
 import os
-import ctranslate2
-import difflib
 import shutil
 import requests
+import pandas as pd
+import difflib
 from concurrent.futures import ThreadPoolExecutor
 
 # Define the device
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
-#
-
-
-
+# OCR Correction Model
+ocr_model_name = "PleIAs/OCRonos-Vintage"
+
+import torch
+from transformers import GPT2LMHeadModel, GPT2Tokenizer
+
+# Load pre-trained model and tokenizer
+model_name = "PleIAs/OCRonos-Vintage"
+model = GPT2LMHeadModel.from_pretrained(model_name)
+tokenizer = GPT2Tokenizer.from_pretrained(model_name)
+
+# Set the device to GPU if available, otherwise use CPU
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model.to(device)
 
-# CSS for formatting (unchanged)
 # CSS for formatting
 css = """
 <style>
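For reference, a minimal smoke test of what this hunk loads; this is a sketch rather than part of the commit, the noisy sample string is invented, and the prompt scaffold mirrors the ### Text ### / ### Correction ### format used by ocr_correction later in the diff:

# Sketch (not in the commit): confirm the OCRonos-Vintage weights load and generate.
# `model`, `tokenizer`, and `device` are the globals defined in the hunk above.
sample = "### Text ###\nTh1s 1s a n0isy OCR line.\n\n\n### Correction ###\n"
ids = tokenizer.encode(sample, return_tensors="pt").to(device)
out = model.generate(ids, max_new_tokens=40, pad_token_id=tokenizer.eos_token_id)
print(tokenizer.decode(out[0], skip_special_tokens=True))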
@@ -117,41 +129,73 @@ def preprocess_text(text):
     return text.strip()
 
 def split_text(text, max_tokens=500):
-    ... (previous 35-line body of split_text; its content is not preserved in this view)
+    parts = text.split("\n")
+    chunks = []
+    current_chunk = ""
+
+    for part in parts:
+        if current_chunk:
+            temp_chunk = current_chunk + "\n" + part
+        else:
+            temp_chunk = part
+
+        num_tokens = len(tokenizer.tokenize(temp_chunk))
+
+        if num_tokens <= max_tokens:
+            current_chunk = temp_chunk
+        else:
+            if current_chunk:
+                chunks.append(current_chunk)
+            current_chunk = part
+
+    if current_chunk:
+        chunks.append(current_chunk)
+
+    if len(chunks) == 1 and len(tokenizer.tokenize(chunks[0])) > max_tokens:
+        long_text = chunks[0]
+        chunks = []
+        while len(tokenizer.tokenize(long_text)) > max_tokens:
+            split_point = len(long_text) // 2
+            while split_point < len(long_text) and not re.match(r'\s', long_text[split_point]):
+                split_point += 1
+            if split_point >= len(long_text):
+                split_point = len(long_text) - 1
+            chunks.append(long_text[:split_point].strip())
+            long_text = long_text[split_point:].strip()
+        if long_text:
+            chunks.append(long_text)
+
+    return chunks
+
+
+# Function to generate text
+def ocr_correction(prompt, max_new_tokens=600, num_threads=os.cpu_count()):
+    prompt = f"""### Text ###\n{prompt}\n\n\n### Correction ###\n"""
+    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
+
+    # Set the number of threads for PyTorch
+    torch.set_num_threads(num_threads)
+
+    # Generate text
+    with ThreadPoolExecutor(max_workers=num_threads) as executor:
+        future = executor.submit(
+            model.generate,
+            input_ids,
+            max_new_tokens=max_new_tokens,
+            pad_token_id=tokenizer.eos_token_id,
+            top_k=50,
+            num_return_sequences=1,
+            do_sample=True,
+            temperature=0.7
+        )
+        output = future.result()
+
+    # Decode and return the generated text
+    result = tokenizer.decode(output[0], skip_special_tokens=True)
+    print(result)
+
+    result = result.split("### Correction ###")[1]
+    return result
 
 # OCR Correction Class
 class OCRCorrector:
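A sketch of how the new split_text behaves, with an invented input; tokenizer is the GPT-2 tokenizer loaded in the first hunk. Newline-separated parts are packed greedily up to max_tokens, and the whitespace-seeking midpoint split only fires when the whole text collapses into a single oversized chunk:

# Sketch (not in the commit): exercise split_text on an artificial document.
doc = "\n".join("Paragraph %d: " % i + "word " * 120 for i in range(10))
for i, chunk in enumerate(split_text(doc, max_tokens=500)):
    # Each chunk stays at or under 500 tokens here, because every
    # individual paragraph is itself under the limit.
    print(i, len(tokenizer.tokenize(chunk)))

Note that when more than one chunk is produced, a single part that alone exceeds max_tokens is still kept whole; only a single-chunk result is hard-split.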
@@ -170,7 +214,7 @@ class TextProcessor:
 
     @spaces.GPU(duration=120)
     def process(self, user_message):
-        #
+        #OCR Correction
        corrected_text, html_diff = self.ocr_corrector.correct(user_message)
 
        # Combine results
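Driving the new helper end to end, with an invented noisy string; the result.split("### Correction ###")[1] step holds up because the decoded output echoes the prompt scaffold, which already contains the marker:

# Sketch (not in the commit): run ocr_correction on a made-up noisy string.
noisy = "Tbe qu1ck br0wn f0x jumps 0ver tbe lazy d0g."
print(ocr_correction(noisy, max_new_tokens=120))

Worth noting: the ThreadPoolExecutor wraps a single model.generate call, so it adds no parallelism of its own; the CPU-side parallelism comes from torch.set_num_threads(num_threads).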