Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -17,48 +17,124 @@ if GEMINI_API_KEY:
|
|
| 17 |
|
| 18 |
if uploaded_file:
|
| 19 |
image = Image.open(uploaded_file)
|
| 20 |
-
st.image(image, caption="Uploaded Image",
|
| 21 |
|
| 22 |
if st.button("Extract & Refine Text"):
|
| 23 |
with st.spinner("Extracting text..."):
|
| 24 |
try:
|
| 25 |
# Initial Extraction
|
| 26 |
prompt_initial = [
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
|
| 41 |
response_initial = model.generate_content(prompt_initial)
|
| 42 |
extracted_text = response_initial.text
|
| 43 |
|
| 44 |
# Refinement Pass
|
| 45 |
prompt_refine = [
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
|
| 54 |
response_refine = model.generate_content(prompt_refine)
|
| 55 |
refined_text = response_refine.text
|
| 56 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
# Display results
|
| 58 |
st.subheader("✅ Final Extracted Text:")
|
| 59 |
st.markdown(refined_text, unsafe_allow_html=True)
|
| 60 |
st.code(refined_text, language="text")
|
| 61 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
# Allow Download
|
| 63 |
st.download_button("⬇️ Download Markdown", refined_text, file_name="extracted_text.md", mime="text/markdown")
|
| 64 |
|
|
|
|
| 17 |
|
| 18 |
if uploaded_file:
|
| 19 |
image = Image.open(uploaded_file)
|
| 20 |
+
st.image(image, caption="Uploaded Image", use_container_width=True)
|
| 21 |
|
| 22 |
if st.button("Extract & Refine Text"):
|
| 23 |
with st.spinner("Extracting text..."):
|
| 24 |
try:
|
| 25 |
# Initial Extraction
|
| 26 |
prompt_initial = [
|
| 27 |
+
image,
|
| 28 |
+
f"""You are an expert in extracting handwritten text from scanned student answer scripts.
|
| 29 |
+
Your task is to accurately extract the handwritten content while ensuring fidelity to the original writing style and structure.
|
| 30 |
+
Follow these strict guidelines:
|
| 31 |
+
|
| 32 |
+
1. **Accurate Extraction:** Extract the handwritten text exactly as it appears, preserving original spacing, punctuation, and line breaks.
|
| 33 |
+
|
| 34 |
+
2. **Handling Scratched-Out Text:**
|
| 35 |
+
- If a word or phrase is visibly scratched out (e.g., crossed out using lines or scribbles):
|
| 36 |
+
- Either omit it entirely from the extracted text **OR**
|
| 37 |
+
- Retain it but apply strikethrough formatting using `~~word~~` (Markdown format).
|
| 38 |
+
- Do not attempt to guess or reconstruct scratched-out words.
|
| 39 |
+
|
| 40 |
+
Excluding Diagrams, Graphs, Tables, and Equations:
|
| 41 |
+
- **Diagrams & Flowcharts:** Detect and exclude them, replacing them with '[Diagram Detected: This section contains a diagram or flowchart illustrating a concept or process.]'. Ensure that the surrounding text structure remains intact.
|
| 42 |
+
- **Graphs:** Identify and omit graphs (e.g., bar charts, line graphs, scatter plots), replacing them with '[Graph Detected: This section contains a visual representation of data, such as trends, distributions, or comparisons.]'. Ensure that surrounding text remains structured and readable.
|
| 43 |
+
- **Tables:** Do not extract table contents. Instead, insert '[Table Detected: A structured table with numerical or categorical data is present.]' in the extracted text to indicate omitted tabular data while maintaining text alignment.
|
| 44 |
+
- **Equations & Expressions:** Omit standalone mathematical formulas while maintaining appropriate spacing. If a mathematical expression is detected, replace it with '[Equation Detected: A mathematical formula or expression is present.]'.
|
| 45 |
+
|
| 46 |
+
Whenever a table, diagram, figure, or flowchart is encountered in the document, **explicitly mention its presence with a relevant description**, but do not attempt to extract or reproduce its contents. The extracted text should remain structured, and no part of a diagram, table, or equation should be reconstructed in any form.
|
| 47 |
+
4. **Preserving Annotations:**
|
| 48 |
+
- If additional notes, comments, or margin annotations are present, extract them separately and label them as **"Annotations"**.
|
| 49 |
+
|
| 50 |
+
5. **No Grammar or Content Correction:**
|
| 51 |
+
- Do not modify spelling mistakes, grammar, or factual content. Extract the text exactly as written, correcting only **recognition errors** (e.g., misidentified characters).
|
| 52 |
+
|
| 53 |
+
6. **Ensure High Accuracy:**
|
| 54 |
+
- Cross-check extracted text to prevent common OCR errors, such as misreading '1' as 'l' or 'O' as '0'.
|
| 55 |
+
"""
|
| 56 |
+
]
|
| 57 |
|
| 58 |
response_initial = model.generate_content(prompt_initial)
|
| 59 |
extracted_text = response_initial.text
|
| 60 |
|
| 61 |
# Refinement Pass
|
| 62 |
prompt_refine = [
|
| 63 |
+
image,
|
| 64 |
+
f"""You are an expert in handwritten text recognition and refinement, specializing in ensuring the highest accuracy in extracted text from student answer scripts.
|
| 65 |
+
|
| 66 |
+
Your task is to carefully compare the initially extracted text with the handwritten content in the provided image and make precise corrections to eliminate any OCR recognition errors.
|
| 67 |
+
|
| 68 |
+
**Guidelines:**
|
| 69 |
+
|
| 70 |
+
1. **Exact Character Matching:**
|
| 71 |
+
- Ensure that every character, symbol, and special notation (e.g., mathematical symbols like ∫, λ, π, Σ, ∂, Greek letters, and subscript/superscript text) is accurately extracted.
|
| 72 |
+
- Pay close attention to case sensitivity, ensuring that uppercase and lowercase letters are correctly identified.
|
| 73 |
+
|
| 74 |
+
2. **Handling of Scratched-Out Text:**
|
| 75 |
+
- If any word or phrase is scratched out, either:
|
| 76 |
+
- Completely **omit it** from the refined text.
|
| 77 |
+
- OR retain it with **strikethrough formatting** using `~~scratched text~~` (Markdown format).
|
| 78 |
+
- Do **not** attempt to reconstruct or infer words that have been heavily scratched out beyond recognition.
|
| 79 |
+
|
| 80 |
+
Excluding Diagrams, Graphs, Tables, and Equations:
|
| 81 |
+
- **Diagrams & Flowcharts:** Detect and exclude them, replacing them with '[Diagram Detected: This section contains a diagram or flowchart illustrating a concept or process.]'. Ensure that the surrounding text structure remains intact.
|
| 82 |
+
- **Graphs:** Identify and omit graphs (e.g., bar charts, line graphs, scatter plots), replacing them with '[Graph Detected: This section contains a visual representation of data, such as trends, distributions, or comparisons.]'. Ensure that surrounding text remains structured and readable.
|
| 83 |
+
- **Tables:** Do not extract table contents. Instead, insert '[Table Detected: A structured table with numerical or categorical data is present.]' in the extracted text to indicate omitted tabular data while maintaining text alignment.
|
| 84 |
+
- **Equations & Expressions:** Omit standalone mathematical formulas while maintaining appropriate spacing. If a mathematical expression is detected, replace it with '[Equation Detected: A mathematical formula or expression is present.]'.
|
| 85 |
+
|
| 86 |
+
Whenever a table, diagram, figure, or flowchart is encountered in the document, **explicitly mention its presence with a relevant description**, but do not attempt to extract or reproduce its contents. The extracted text should remain structured, and no part of a diagram, table, or equation should be reconstructed in any form.
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
4. **Preserving Formatting & Spacing:**
|
| 90 |
+
- Maintain the **exact structure** of the handwritten text, including spacing, punctuation, and line breaks.
|
| 91 |
+
- If words are separated by unusual spacing, preserve that spacing in the extracted text.
|
| 92 |
+
|
| 93 |
+
5. **Correction of OCR Recognition Errors:**
|
| 94 |
+
- Identify and fix **common OCR mistakes**, such as:
|
| 95 |
+
- Misinterpreting **numbers and letters** (e.g., ‘1’ vs. ‘l’, ‘0’ vs. ‘O’).
|
| 96 |
+
- Incorrectly recognizing **special characters** (e.g., ‘∑’ mistaken as ‘E’).
|
| 97 |
+
- Missing **accents or diacritics** (e.g., é, ü, ñ).
|
| 98 |
+
- If a character is ambiguous, cross-check it against the handwriting to determine the most likely correct representation.
|
| 99 |
+
|
| 100 |
+
6. **No Grammar or Content Modification:**
|
| 101 |
+
- Do **not** alter spelling, grammar, or factual content.
|
| 102 |
+
- Only correct recognition errors—do not "fix" perceived mistakes made by the writer.
|
| 103 |
+
|
| 104 |
+
7. **Final Validation:**
|
| 105 |
+
- Before finalizing the output, perform a second verification pass to ensure that **every** character exactly matches the handwritten content in the image.
|
| 106 |
+
|
| 107 |
+
**Final Output Format:**
|
| 108 |
+
- The **refined text** should be provided in **plain text or Markdown format**, ensuring readability while preserving structure and accuracy.
|
| 109 |
+
- Clearly indicate any corrections made compared to the initially extracted text.
|
| 110 |
+
"""
|
| 111 |
+
]
|
| 112 |
+
|
| 113 |
|
| 114 |
response_refine = model.generate_content(prompt_refine)
|
| 115 |
refined_text = response_refine.text
|
| 116 |
|
| 117 |
+
flagged_elements = []
|
| 118 |
+
keywords = {
|
| 119 |
+
"Diagrams": ["[Diagram Detected]", "diagram", "flowchart"],
|
| 120 |
+
"Graphs": ["[Graph Detected]", "graph", "chart"],
|
| 121 |
+
"Tables": ["[Table Detected]", "table"],
|
| 122 |
+
"Equations": ["[Equation Detected]", "equation", "formula"]
|
| 123 |
+
}
|
| 124 |
+
for key, words in keywords.items():
|
| 125 |
+
if any(word.lower() in refined_text.lower() for word in words):
|
| 126 |
+
flagged_elements.append(key)
|
| 127 |
+
|
| 128 |
+
|
| 129 |
# Display results
|
| 130 |
st.subheader("✅ Final Extracted Text:")
|
| 131 |
st.markdown(refined_text, unsafe_allow_html=True)
|
| 132 |
st.code(refined_text, language="text")
|
| 133 |
|
| 134 |
+
# Display warning if flagged elements were found
|
| 135 |
+
if flagged_elements:
|
| 136 |
+
st.warning(f"⚠️ The extracted text contains: {', '.join(flagged_elements)}. Please review these sections manually.")
|
| 137 |
+
|
| 138 |
# Allow Download
|
| 139 |
st.download_button("⬇️ Download Markdown", refined_text, file_name="extracted_text.md", mime="text/markdown")
|
| 140 |
|