Spaces:
Sleeping
Sleeping
Update app.py
Browse files
Updated App.py
app.py
CHANGED
|
@@ -40,6 +40,22 @@ def init_qwen_model():
|
|
| 40 |
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
|
| 41 |
return model.eval(), processor
|
| 42 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
# Extract text using GOT
|
| 44 |
def extract_text_got(image_file, model, tokenizer):
    """Run OCR on ``image_file`` through the model's ``chat`` interface.

    Args:
        image_file: Image to read; passed to ``model.chat`` unchanged.
        model: Object exposing ``chat(tokenizer, image_file, ocr_type=...)``.
        tokenizer: Tokenizer forwarded as the first argument of ``chat``.

    Returns:
        Whatever ``model.chat`` returns for ``ocr_type='ocr'``.
    """
    ocr_result = model.chat(tokenizer, image_file, ocr_type='ocr')
    return ocr_result
|
|
@@ -57,13 +73,6 @@ def extract_text_qwen(image_file, model, processor):
|
|
| 57 |
except Exception as e:
|
| 58 |
return f"An error occurred: {str(e)}"
|
| 59 |
|
| 60 |
-
# Text Cleaning AI - Clean spaces, handle dual languages
|
| 61 |
-
def clean_extracted_text(text):
    """Normalize whitespace in extracted text.

    Every run of whitespace (spaces, tabs, newlines) is collapsed into a
    single space, leading/trailing whitespace is trimmed, and the space left
    in front of ``?``, ``.``, ``!`` or ``,`` is removed.

    Args:
        text: Raw extracted text. ``None`` and ``""`` are accepted.

    Returns:
        The cleaned string, or ``""`` when there is nothing to clean.
    """
    if not text:
        # re.sub raises TypeError on None; an empty result is the sane answer.
        return ""
    # Collapse whitespace runs first so at most one space precedes punctuation.
    cleaned_text = re.sub(r'\s+', ' ', text).strip()
    # Drop the single space that may remain before sentence punctuation.
    cleaned_text = re.sub(r'\s([?.!,])', r'\1', cleaned_text)
    return cleaned_text
|
| 66 |
-
|
| 67 |
# Highlight keyword search
|
| 68 |
def highlight_text(text, search_term):
|
| 69 |
if not search_term:
|
|
@@ -126,19 +135,25 @@ if predict_button and uploaded_file:
|
|
| 126 |
# Clean extracted text
|
| 127 |
cleaned_text = clean_extracted_text(extracted_text)
|
| 128 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 129 |
# Delete temp file
|
| 130 |
if os.path.exists(temp_file_path):
|
| 131 |
os.remove(temp_file_path)
|
| 132 |
|
| 133 |
# Display extracted text and search functionality
|
| 134 |
-
st.subheader("Extracted Text (Cleaned)")
|
| 135 |
-
st.markdown(
|
| 136 |
|
| 137 |
search_query = st.text_input("Search in extracted text:", key="search_query", placeholder="Type to search...")
|
| 138 |
if search_query:
|
| 139 |
-
highlighted_text = highlight_text(
|
| 140 |
st.markdown("### Highlighted Search Results:")
|
| 141 |
st.markdown(highlighted_text, unsafe_allow_html=True)
|
| 142 |
else:
|
| 143 |
st.markdown("### Extracted Text:")
|
| 144 |
-
st.markdown(
|
|
|
|
| 40 |
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
|
| 41 |
return model.eval(), processor
|
| 42 |
|
| 43 |
+
# Text Cleaning AI - Clean spaces, handle dual languages
|
| 44 |
+
def clean_extracted_text(text):
    """Normalize whitespace in extracted text.

    Every run of whitespace (spaces, tabs, newlines) is collapsed into a
    single space, leading/trailing whitespace is trimmed, and the space left
    in front of ``?``, ``.``, ``!`` or ``,`` is removed.

    Args:
        text: Raw extracted text. ``None`` and ``""`` are accepted.

    Returns:
        The cleaned string, or ``""`` when there is nothing to clean.
    """
    if not text:
        # re.sub raises TypeError on None; an empty result is the sane answer.
        return ""
    # Collapse whitespace runs first so at most one space precedes punctuation.
    cleaned_text = re.sub(r'\s+', ' ', text).strip()
    # Drop the single space that may remain before sentence punctuation.
    cleaned_text = re.sub(r'\s([?.!,])', r'\1', cleaned_text)
    return cleaned_text
|
| 49 |
+
|
| 50 |
+
# Polish the text using a model
|
| 51 |
+
def polish_text_with_ai(cleaned_text, model, tokenizer, max_new_tokens=100):
    """Ask a text-generation pipeline to rewrite text for better language flow.

    Polishing is best-effort: if the input is blank, the pipeline cannot be
    built or run, or the model produces nothing, the input text is returned
    unchanged so the caller always has something to display.

    Args:
        cleaned_text: Text to polish (typically already whitespace-cleaned).
        model: Model object handed to the ``text-generation`` pipeline.
        tokenizer: Tokenizer handed to the pipeline alongside ``model``.
        max_new_tokens: Upper bound on *generated* tokens. The prompt is not
            counted, so long inputs still leave room for output.

    Returns:
        The generated rewrite without the instruction prompt, or
        ``cleaned_text`` (``""`` for ``None``) when polishing is not possible.
    """
    if not cleaned_text or not cleaned_text.strip():
        # Nothing to polish; skip loading the pipeline entirely.
        return cleaned_text or ""

    # Imports stay local, as in the original, so they only run when needed.
    import logging
    from transformers import pipeline

    prompt = f"Correct and clean the following text: '{cleaned_text}' and make it meaningful."
    try:
        # NOTE(review): the caller in this file passes the GOT OCR model here;
        # confirm it is actually usable with a text-generation pipeline.
        nlp = pipeline('text-generation', model=model, tokenizer=tokenizer)
        outputs = nlp(
            prompt,
            # max_length would also count the prompt tokens and can leave no
            # budget for generation; max_new_tokens bounds only the output.
            max_new_tokens=max_new_tokens,
            num_return_sequences=1,
            # Without this the pipeline echoes the prompt in generated_text.
            return_full_text=False,
        )
        polished_text = outputs[0]['generated_text'].strip()
    except Exception:
        # Polishing is optional: log the failure and fall back to the input
        # rather than aborting the whole extraction flow.
        logging.getLogger(__name__).exception("Text polishing failed; using cleaned text")
        return cleaned_text
    return polished_text or cleaned_text
|
| 58 |
+
|
| 59 |
# Extract text using GOT
|
| 60 |
def extract_text_got(image_file, model, tokenizer):
    """Run OCR on ``image_file`` through the model's ``chat`` interface.

    Args:
        image_file: Image to read; passed to ``model.chat`` unchanged.
        model: Object exposing ``chat(tokenizer, image_file, ocr_type=...)``.
        tokenizer: Tokenizer forwarded as the first argument of ``chat``.

    Returns:
        Whatever ``model.chat`` returns for ``ocr_type='ocr'``.
    """
    ocr_result = model.chat(tokenizer, image_file, ocr_type='ocr')
    return ocr_result
|
|
|
|
| 73 |
except Exception as e:
|
| 74 |
return f"An error occurred: {str(e)}"
|
| 75 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 76 |
# Highlight keyword search
|
| 77 |
def highlight_text(text, search_term):
|
| 78 |
if not search_term:
|
|
|
|
| 135 |
# Clean extracted text
|
| 136 |
cleaned_text = clean_extracted_text(extracted_text)
|
| 137 |
|
| 138 |
+
# Optionally, polish text with AI model for better language flow
|
| 139 |
+
if model_choice in ["GOT_CPU", "GOT_GPU"]:
|
| 140 |
+
polished_text = polish_text_with_ai(cleaned_text, got_model, tokenizer)
|
| 141 |
+
else:
|
| 142 |
+
polished_text = cleaned_text
|
| 143 |
+
|
| 144 |
# Delete temp file
|
| 145 |
if os.path.exists(temp_file_path):
|
| 146 |
os.remove(temp_file_path)
|
| 147 |
|
| 148 |
# Display extracted text and search functionality
|
| 149 |
+
st.subheader("Extracted Text (Cleaned & Polished)")
|
| 150 |
+
st.markdown(polished_text, unsafe_allow_html=True)
|
| 151 |
|
| 152 |
search_query = st.text_input("Search in extracted text:", key="search_query", placeholder="Type to search...")
|
| 153 |
if search_query:
|
| 154 |
+
highlighted_text = highlight_text(polished_text, search_query)
|
| 155 |
st.markdown("### Highlighted Search Results:")
|
| 156 |
st.markdown(highlighted_text, unsafe_allow_html=True)
|
| 157 |
else:
|
| 158 |
st.markdown("### Extracted Text:")
|
| 159 |
+
st.markdown(polished_text, unsafe_allow_html=True)
|