Spaces:

chiichann
/

midterm_project_historical_document_deciphering_app

Sleeping

App Files Files Community

chiichann committed on Apr 10, 2025

Commit

48f9fac

verified ·

1 Parent(s): 586280c

Update app.py

Browse files

Files changed (1) hide show

app.py +30 -21

app.py CHANGED Viewed

@@ -11,7 +11,13 @@ from difflib import SequenceMatcher
 import folium
 from streamlit_folium import st_folium
 import wikipediaapi
 wiki_wiki = wikipediaapi.Wikipedia(
     language='en',
     user_agent='AI-Historical-Doc-App/1.0 (contact: cherilynmarie.deocampo@wvsu.edu.com)'
@@ -51,14 +57,14 @@ def enhance_image(image):
     # Sharpening
     kernel = np.array([[0, -1, 0],
-                       [-1, 5,-1],
                        [0, -1, 0]])
     sharpened = cv2.filter2D(denoised, -1, kernel)
     # Thresholding (binarization)
     _, binary = cv2.threshold(sharpened, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
-    # Optional: Resize (sometimes helps OCR)
     scale_percent = 150
     width = int(binary.shape[1] * scale_percent / 100)
     height = int(binary.shape[0] * scale_percent / 100)
@@ -66,7 +72,7 @@ def enhance_image(image):
     return resized
-# Function to perform OCR using EasyOCR
 def perform_ocr(image):
     if isinstance(image, np.ndarray):
         img_array = image
@@ -77,7 +83,7 @@ def perform_ocr(image):
     text = '\n'.join(results)
     return text
-# Function to extract named entities
 def extract_entities(text):
     entities = ner_pipeline(text)
     extracted = {}
@@ -86,18 +92,28 @@ def extract_entities(text):
         extracted.setdefault(label, set()).add(ent['word'])
     return extracted
 def get_historical_context(entities):
     context = {}
     for label, values in entities.items():
         for item in values:
-            page = wiki_wiki.page(item)
-            if page.exists():
-                context[item] = page.summary[:500]  # Limit summary length
-            else:
-                context[item] = f"No historical info found for '{item}'."
     return context
-# Function to correct OCR errors (suggestions)
 def suggest_corrections(original_text):
     words = original_text.split()
     suggestions = {}
@@ -109,23 +125,23 @@ def suggest_corrections(original_text):
                 suggestions[word] = close_matches[0]
     return suggestions
-# Function to generate map
 def generate_map(entities):
     m = folium.Map(location=[20, 0], zoom_start=2)
     if "LOC" in entities:
         for location in entities["LOC"]:
-            # Dummy coordinates for demonstration
             folium.Marker(
-                location=[51.5074, -0.1278],  # Example: London
                 popup=f"Location: {location}",
                 tooltip=location
             ).add_to(m)
     return m
 if uploaded_file:
     file_type = uploaded_file.type
-    # Display and process the uploaded document
     if file_type == "application/pdf":
         images = pdf_to_images(uploaded_file.read())
     else:
@@ -134,42 +150,35 @@ if uploaded_file:
     for image in images:
         st.image(image, caption="Uploaded Document", use_container_width=True)
-        # Enhance image
         enhanced = enhance_image(image)
         st.image(enhanced, caption="Enhanced Image", use_container_width=True, channels="GRAY")
-        # Perform OCR
         ocr_text = perform_ocr(enhanced)
         st.subheader("Extracted Text (OCR)")
         st.text_area("Text", ocr_text, height=200)
-        # Suggest corrections
         corrections = suggest_corrections(ocr_text)
         if corrections:
             st.subheader("AI Suggestions for Possible Corrections")
             for original, suggestion in corrections.items():
                 st.markdown(f"**{original}** ➔ *{suggestion}*")
-        # Summarize text
         if len(ocr_text.strip()) > 50:
             summary = summarizer(ocr_text, max_length=60, min_length=20, do_sample=False)[0]['summary_text']
             st.subheader("Summary")
             st.write(summary)
-        # Extract entities
         entities = extract_entities(ocr_text)
         st.subheader("Key Information")
         for label, items in entities.items():
             st.markdown(f"**{label}**: {', '.join(items)}")
-        # Provide historical context
         context = get_historical_context(entities)
         if context:
             st.subheader("Historical Context & Insights")
             for item, info in context.items():
                 st.markdown(f"- **{item}**: {info}")
-        # Visualize map
         st.subheader("Locations Mentioned")
         map_ = generate_map(entities)
         st_folium(map_, width=700)

 import folium
 from streamlit_folium import st_folium
 import wikipediaapi
+import logging
+import re
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+# Wikipedia API setup
 wiki_wiki = wikipediaapi.Wikipedia(
     language='en',
     user_agent='AI-Historical-Doc-App/1.0 (contact: cherilynmarie.deocampo@wvsu.edu.com)'
     # Sharpening
     kernel = np.array([[0, -1, 0],
+                       [-1, 5, -1],
                        [0, -1, 0]])
     sharpened = cv2.filter2D(denoised, -1, kernel)
     # Thresholding (binarization)
     _, binary = cv2.threshold(sharpened, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
+    # Resize
     scale_percent = 150
     width = int(binary.shape[1] * scale_percent / 100)
     height = int(binary.shape[0] * scale_percent / 100)
     return resized
+# OCR
 def perform_ocr(image):
     if isinstance(image, np.ndarray):
         img_array = image
     text = '\n'.join(results)
     return text
+# Extract named entities
 def extract_entities(text):
     entities = ner_pipeline(text)
     extracted = {}
         extracted.setdefault(label, set()).add(ent['word'])
     return extracted
# Clean extracted entities for Wikipedia
def clean_entity(text):
    """Normalize an entity string so it can be used as a Wikipedia title.

    Punctuation is removed, except hyphens and apostrophes that sit between
    two word characters (``Austria-Hungary``, ``O'Brien``); deleting those
    would fuse or mangle names and make the title lookup miss. Runs of
    whitespace are collapsed to single spaces and the ends are trimmed.

    Args:
        text: Raw entity string.

    Returns:
        The cleaned string; empty when nothing searchable remains.
    """
    # Drop everything that is not a word char, whitespace, hyphen or apostrophe.
    kept = re.sub(r"[^\w\s'\-]", "", text)
    # Remove hyphens/apostrophes that are not flanked by word characters
    # on both sides (leading quotes, trailing dashes, stand-alone marks).
    kept = re.sub(r"(?<!\w)['\-]+|['\-]+(?!\w)", "", kept)
    # Collapse the gaps left behind by removed punctuation.
    return " ".join(kept.split())
# Historical context fetcher
def get_historical_context(entities):
    """Collect a short Wikipedia summary for every extracted entity.

    Args:
        entities: Mapping of entity label to an iterable of entity strings
            (the label -> set shape built by ``extract_entities``).

    Returns:
        dict keyed by the original entity string. Each value is the page
        summary truncated to 500 characters, a "No historical info found"
        notice, or an "Error fetching data" message when the lookup raised.
    """
    context = {}
    # Only the entity strings matter here; labels are not used.
    for values in entities.values():
        for item in values:
            if item in context:
                # Same entity already resolved under another label.
                continue
            cleaned_item = clean_entity(item)
            if not cleaned_item:
                # Nothing searchable is left after cleaning; skip the
                # lookup instead of querying Wikipedia with an empty title.
                context[item] = f"No historical info found for '{item}'."
                continue
            try:
                page = wiki_wiki.page(cleaned_item)
                if page.exists():
                    context[item] = page.summary[:500]  # Limit summary
                else:
                    context[item] = f"No historical info found for '{item}'."
            except Exception as e:
                # Best-effort boundary: one failed lookup must not abort
                # processing of the remaining entities.
                logging.warning("Wikipedia lookup failed for '%s': %s", item, e)
                context[item] = f"Error fetching data for '{item}': {e}"
    return context
+# Suggest corrections
 def suggest_corrections(original_text):
     words = original_text.split()
     suggestions = {}
                 suggestions[word] = close_matches[0]
     return suggestions
# Generate map
def generate_map(entities):
    """Build a world map with one marker per entity under the "LOC" label.

    Geocoding is not implemented: every marker is placed at the same
    placeholder coordinates and differs only in its popup and tooltip text.

    Args:
        entities: Mapping of entity label to an iterable of entity strings.

    Returns:
        The folium map, with no markers when there is no "LOC" entry.
    """
    world_map = folium.Map(location=[20, 0], zoom_start=2)
    if "LOC" not in entities:
        return world_map

    # Dummy coordinates shared by all markers
    placeholder_coords = (51.5074, -0.1278)
    for place in entities["LOC"]:
        marker = folium.Marker(
            location=list(placeholder_coords),
            popup=f"Location: {place}",
            tooltip=place,
        )
        marker.add_to(world_map)
    return world_map
+# Main process
 if uploaded_file:
     file_type = uploaded_file.type
     if file_type == "application/pdf":
         images = pdf_to_images(uploaded_file.read())
     else:
     for image in images:
         st.image(image, caption="Uploaded Document", use_container_width=True)
         enhanced = enhance_image(image)
         st.image(enhanced, caption="Enhanced Image", use_container_width=True, channels="GRAY")
         ocr_text = perform_ocr(enhanced)
         st.subheader("Extracted Text (OCR)")
         st.text_area("Text", ocr_text, height=200)
         corrections = suggest_corrections(ocr_text)
         if corrections:
             st.subheader("AI Suggestions for Possible Corrections")
             for original, suggestion in corrections.items():
                 st.markdown(f"**{original}** ➔ *{suggestion}*")
         if len(ocr_text.strip()) > 50:
             summary = summarizer(ocr_text, max_length=60, min_length=20, do_sample=False)[0]['summary_text']
             st.subheader("Summary")
             st.write(summary)
         entities = extract_entities(ocr_text)
         st.subheader("Key Information")
         for label, items in entities.items():
             st.markdown(f"**{label}**: {', '.join(items)}")
         context = get_historical_context(entities)
         if context:
             st.subheader("Historical Context & Insights")
             for item, info in context.items():
                 st.markdown(f"- **{item}**: {info}")
         st.subheader("Locations Mentioned")
         map_ = generate_map(entities)
         st_folium(map_, width=700)