OCR-Gemini

Sleeping

App Files Files Community

Musharraf11 committed on Apr 8, 2025

Commit

4800daf

verified ·

1 Parent(s): 0a0f0fc

Update app.py

Browse files

Files changed (1) hide show

app.py +97 -21

app.py CHANGED Viewed

@@ -17,48 +17,124 @@ if GEMINI_API_KEY:
     if uploaded_file:
         image = Image.open(uploaded_file)
-        st.image(image, caption="Uploaded Image", use_column_width=True)
         if st.button("Extract & Refine Text"):
             with st.spinner("Extracting text..."):
                 try:
                     # Initial Extraction
                     prompt_initial = [
-                        image,
-                        "You are an expert in extracting handwritten text from student answer scripts. Your task is to:\n\n"
-                        "1. Accurately extract the handwritten text exactly as it is.\n"
-                        "2. Carefully cross-check and correct only recognition errors (e.g., OCR mistakes).\n"
-                        "3. Do not modify or clean up grammatical or factual errors.\n"
-                        "4. If the student's script contains tables, diagrams, figures, or flowcharts, you must:\n"
-                        "- Reproduce tables in plain text or markdown table format, ensuring blank cells remain blank.\n"
-                        "- Recreate flowcharts using markdown-friendly formats (e.g., Mermaid).\n"
-                        "- For diagrams, recreate using ASCII art if possible.\n\n"
-                        "If the student's script contains any graphs (e.g., line graphs, bar charts, scatter plots with X and Y axes), recreate them using markdown-supported formats such as Mermaid.js or ASCII plots. Ensure the axes, labels, and plotted data points are captured accurately. If needed, describe the graph layout and its key data in words.\n\n"
-                        "Your role is to capture the student's original input precisely, as this will be evaluated by a human. Maintain full fidelity to the original script.\n\n"
-                        "Additionally, if there are any strikethrough texts, either preserve them with formatting (e.g., ~~strikethrough~~) or ignore them from the extraction."
-                    ]
                     response_initial = model.generate_content(prompt_initial)
                     extracted_text = response_initial.text
                     # Refinement Pass
                     prompt_refine = [
-                        image,
-                        f"Here is the initially extracted text:\n```\n{extracted_text}\n```\n"
-                        "Now, carefully correct any OCR errors **without modifying spelling mistakes, grammar, or factual content**:\n"
-                        "- Maintain markdown tables correctly aligned, ensuring blank cells remain blank.\n"
-                        "- Fix incorrect characters or missing words but do not autofill empty spaces.\n"
-                        "- Ensure strikethrough text is either preserved as `~~text~~` or removed based on the original formatting."
-                    ]
                     response_refine = model.generate_content(prompt_refine)
                     refined_text = response_refine.text
                     # Display results
                     st.subheader("✅ Final Extracted Text:")
                     st.markdown(refined_text, unsafe_allow_html=True)
                     st.code(refined_text, language="text")
                     # Allow Download
                     st.download_button("⬇️ Download Markdown", refined_text, file_name="extracted_text.md", mime="text/markdown")

     if uploaded_file:
         image = Image.open(uploaded_file)
+        st.image(image, caption="Uploaded Image", use_container_width=True)
         if st.button("Extract & Refine Text"):
             with st.spinner("Extracting text..."):
                 try:
                     # Initial Extraction
                     prompt_initial = [
+                    image,
+                    f"""You are an expert in extracting handwritten text from scanned student answer scripts.
+                    Your task is to accurately extract the handwritten content while ensuring fidelity to the original writing style and structure.
+                    Follow these strict guidelines:
+                    1. **Accurate Extraction:** Extract the handwritten text exactly as it appears, preserving original spacing, punctuation, and line breaks.
+                    2. **Handling Scratched-Out Text:**
+                    - If a word or phrase is visibly scratched out (e.g., crossed out using lines or scribbles):
+                    - Either omit it entirely from the extracted text **OR**
+                    - Retain it but apply strikethrough formatting using `~~word~~` (Markdown format).
+                    - Do not attempt to guess or reconstruct scratched-out words.
+                    Excluding Diagrams, Graphs, Tables, and Equations:
+                    - **Diagrams & Flowcharts:** Detect and exclude them, replacing them with '[Diagram Detected: This section contains a diagram or flowchart illustrating a concept or process.]'. Ensure that the surrounding text structure remains intact.
+                    - **Graphs:** Identify and omit graphs (e.g., bar charts, line graphs, scatter plots), replacing them with '[Graph Detected: This section contains a visual representation of data, such as trends, distributions, or comparisons.]'. Ensure that surrounding text remains structured and readable.
+                    - **Tables:** Do not extract table contents. Instead, insert '[Table Detected: A structured table with numerical or categorical data is present.]' in the extracted text to indicate omitted tabular data while maintaining text alignment.
+                    - **Equations & Expressions:** Omit standalone mathematical formulas while maintaining appropriate spacing. If a mathematical expression is detected, replace it with '[Equation Detected: A mathematical formula or expression is present.]'.
+                    Whenever a table, diagram, figure, or flowchart is encountered in the document, **explicitly mention its presence with a relevant description**, but do not attempt to extract or reproduce its contents. The extracted text should remain structured, and no part of a diagram, table, or equation should be reconstructed in any form.
+                    4. **Preserving Annotations:**
+                    - If additional notes, comments, or margin annotations are present, extract them separately and label them as **"Annotations"**.
+                    5. **No Grammar or Content Correction:**
+                    - Do not modify spelling mistakes, grammar, or factual content. Extract the text exactly as written, correcting only **recognition errors** (e.g., misidentified characters).
+                    6. **Ensure High Accuracy:**
+                    - Cross-check extracted text to prevent common OCR errors, such as misreading '1' as 'l' or 'O' as '0'.
+                    """
+                ]
                     response_initial = model.generate_content(prompt_initial)
                     extracted_text = response_initial.text
                     # Refinement Pass
                     prompt_refine = [
+                    image,
+                    f"""You are an expert in handwritten text recognition and refinement, specializing in ensuring the highest accuracy in extracted text from student answer scripts.
+                    Your task is to carefully compare the initially extracted text with the handwritten content in the provided image and make precise corrections to eliminate any OCR recognition errors.
+                    **Guidelines:**
+                    1. **Exact Character Matching:**
+                    - Ensure that every character, symbol, and special notation (e.g., mathematical symbols like ∫, λ, π, Σ, ∂, Greek letters, and subscript/superscript text) is accurately extracted.
+                    - Pay close attention to case sensitivity, ensuring that uppercase and lowercase letters are correctly identified.
+                    2. **Handling of Scratched-Out Text:**
+                    - If any word or phrase is scratched out, either:
+                        - Completely **omit it** from the refined text.
+                        - OR retain it with **strikethrough formatting** using `~~scratched text~~` (Markdown format).
+                    - Do **not** attempt to reconstruct or infer words that have been heavily scratched out beyond recognition.
+                    Excluding Diagrams, Graphs, Tables, and Equations:
+                    - **Diagrams & Flowcharts:** Detect and exclude them, replacing them with '[Diagram Detected: This section contains a diagram or flowchart illustrating a concept or process.]'. Ensure that the surrounding text structure remains intact.
+                    - **Graphs:** Identify and omit graphs (e.g., bar charts, line graphs, scatter plots), replacing them with '[Graph Detected: This section contains a visual representation of data, such as trends, distributions, or comparisons.]'. Ensure that surrounding text remains structured and readable.
+                    - **Tables:** Do not extract table contents. Instead, insert '[Table Detected: A structured table with numerical or categorical data is present.]' in the extracted text to indicate omitted tabular data while maintaining text alignment.
+                    - **Equations & Expressions:** Omit standalone mathematical formulas while maintaining appropriate spacing. If a mathematical expression is detected, replace it with '[Equation Detected: A mathematical formula or expression is present.]'.
+                    Whenever a table, diagram, figure, or flowchart is encountered in the document, **explicitly mention its presence with a relevant description**, but do not attempt to extract or reproduce its contents. The extracted text should remain structured, and no part of a diagram, table, or equation should be reconstructed in any form.
+                    4. **Preserving Formatting & Spacing:**
+                    - Maintain the **exact structure** of the handwritten text, including spacing, punctuation, and line breaks.
+                    - If words are separated by unusual spacing, preserve that spacing in the extracted text.
+                    5. **Correction of OCR Recognition Errors:**
+                    - Identify and fix **common OCR mistakes**, such as:
+                        - Misinterpreting **numbers and letters** (e.g., ‘1’ vs. ‘l’, ‘0’ vs. ‘O’).
+                        - Incorrectly recognizing **special characters** (e.g., ‘∑’ mistaken as ‘E’).
+                        - Missing **accents or diacritics** (e.g., é, ü, ñ).
+                    - If a character is ambiguous, cross-check it against the handwriting to determine the most likely correct representation.
+                    6. **No Grammar or Content Modification:**
+                    - Do **not** alter spelling, grammar, or factual content.
+                    - Only correct recognition errors—do not "fix" perceived mistakes made by the writer.
+                    7. **Final Validation:**
+                    - Before finalizing the output, perform a second verification pass to ensure that **every** character exactly matches the handwritten content in the image.
+                    **Final Output Format:**
+                    - The **refined text** should be provided in **plain text or Markdown format**, ensuring readability while preserving structure and accuracy.
+                    - Clearly indicate any corrections made compared to the initially extracted text.
+                    """
+                ]
                     response_refine = model.generate_content(prompt_refine)
                     refined_text = response_refine.text
+                    flagged_elements = []
+                    keywords = {
+                        "Diagrams": ["[Diagram Detected]", "diagram", "flowchart"],
+                        "Graphs": ["[Graph Detected]", "graph", "chart"],
+                        "Tables": ["[Table Detected]", "table"],
+                        "Equations": ["[Equation Detected]", "equation", "formula"]
+                    }
+                    for key, words in keywords.items():
+                        if any(word.lower() in refined_text.lower() for word in words):
+                            flagged_elements.append(key)
                     # Display results
                     st.subheader("✅ Final Extracted Text:")
                     st.markdown(refined_text, unsafe_allow_html=True)
                     st.code(refined_text, language="text")
+                    # Display warning if flagged elements were found
+                    if flagged_elements:
+                        st.warning(f"⚠️ The extracted text contains: {', '.join(flagged_elements)}. Please review these sections manually.")
                     # Allow Download
                     st.download_button("⬇️ Download Markdown", refined_text, file_name="extracted_text.md", mime="text/markdown")