Spaces:
Sleeping
Sleeping
Update ocr_cpu.py
Browse files — commit: "Refactor ocr_cpu.py"
- ocr_cpu.py +17 -13
ocr_cpu.py
CHANGED
|
@@ -1,7 +1,6 @@
|
|
| 1 |
import os
|
| 2 |
-
from transformers import AutoModel, AutoTokenizer
|
| 3 |
import torch
|
| 4 |
-
import
|
| 5 |
|
| 6 |
# Load model and tokenizer
|
| 7 |
model_name = "srimanth-d/GOT_CPU" # Using GOT model on CPU
|
|
@@ -57,17 +56,22 @@ def extract_text_got(uploaded_file):
|
|
| 57 |
os.remove(temp_file_path)
|
| 58 |
print(f"Temporary file {temp_file_path} removed.")
|
| 59 |
|
| 60 |
-
# Function to clean extracted text
|
| 61 |
-
def
|
| 62 |
"""
|
| 63 |
-
Cleans extracted text by
|
| 64 |
"""
|
| 65 |
-
|
| 66 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
|
|
|
| 1 |
import os
|
|
|
|
| 2 |
import torch
|
| 3 |
+
from transformers import AutoModel, AutoTokenizer
|
| 4 |
|
| 5 |
# Load model and tokenizer
|
| 6 |
model_name = "srimanth-d/GOT_CPU" # Using GOT model on CPU
|
|
|
|
| 56 |
os.remove(temp_file_path)
|
| 57 |
print(f"Temporary file {temp_file_path} removed.")
|
| 58 |
|
| 59 |
+
# Function to clean extracted text using AI
def clean_text_with_ai(extracted_text):
    """
    Clean OCR-extracted text by leveraging the loaded AI model.

    Feeds the raw extracted text back through the module-level model and
    returns the decoded generation, stripped of surrounding whitespace.

    Parameters:
        extracted_text (str): Raw text produced by the OCR step.

    Returns:
        str: The model's cleaned text, or an error message string if any
        exception occurs (callers must not assume success from a str return).
    """
    try:
        # Tokenize the input for the model.
        # NOTE(review): `tokenizer`, `model`, and `device` are module-level
        # names defined elsewhere in this file; `device` is never defined in
        # the visible source of this CPU-only script — confirm it exists
        # (e.g. device = "cpu") or this call raises NameError.
        inputs = tokenizer(extracted_text, return_tensors="pt").to(device)

        # Generate cleaned text; no_grad avoids building autograd state.
        with torch.no_grad():
            # max_new_tokens=100 caps the output — long inputs will be
            # truncated in the cleaned result. Adjust as needed.
            outputs = model.generate(**inputs, max_new_tokens=100)

        # Decode the generated output back to a plain string.
        cleaned_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

        return cleaned_text.strip()  # Return the cleaned text
    except Exception as e:
        # Best-effort: surface the failure as a string rather than raising,
        # matching this script's error-handling style.
        return f"Error during AI text cleaning: {str(e)}"