"""Streamlit app that deciphers scanned historical documents.

For each uploaded image (or each page of an uploaded PDF) the app:
enhances the image, runs OCR, proposes spelling corrections against a small
reference vocabulary, summarizes the text, extracts named entities, looks the
entities up on Wikipedia, and shows a map with one marker per location entity.
"""

import io
import logging
import re
from difflib import SequenceMatcher

import cv2
import easyocr
import fitz  # PyMuPDF
import folium
import numpy as np
import requests
import streamlit as st
import wikipediaapi
from PIL import Image
from streamlit_folium import st_folium
from transformers import pipeline

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Words that OCR output is compared against when suggesting corrections.
_REFERENCE_VOCABULARY = ("document", "historical", "archive", "event", "location")
# Minimum SequenceMatcher ratio (exclusive) for a word to count as a near miss.
_MATCH_THRESHOLD = 0.75
# Leading/trailing non-word characters (quotes, commas, periods, ...).
_EDGE_PUNCTUATION = re.compile(r"^\W+|\W+$")
# Any character that is neither a word character nor whitespace.
_NON_WORD_CHARS = re.compile(r"[^\w\s]")

# Streamlit App
# set_page_config is issued before anything else touches Streamlit so that it
# remains the first Streamlit command even when a cached loader shows a spinner.
st.set_page_config(page_title="AI Historical Document Decipher", layout="wide")


# Streamlit re-executes this script on every interaction; st.cache_resource
# keeps the heavyweight models alive across reruns instead of reloading them.
@st.cache_resource
def _load_summarizer():
    """Build the summarization pipeline once per server process."""
    return pipeline("summarization", model="facebook/bart-large-cnn")


@st.cache_resource
def _load_ner_pipeline():
    """Build the named-entity-recognition pipeline once per server process."""
    return pipeline("ner", aggregation_strategy="simple")


@st.cache_resource
def _load_ocr_reader():
    """Build the CPU-only English EasyOCR reader once per server process."""
    return easyocr.Reader(['en'], gpu=False)


# Wikipedia API setup
wiki_wiki = wikipediaapi.Wikipedia(
    language='en',
    user_agent='AI-Historical-Doc-App/1.0 (contact: cherilynmarie.deocampo@wvsu.edu.com)'
)

# Load summarization and NER pipeline
summarizer = _load_summarizer()
ner_pipeline = _load_ner_pipeline()

# Initialize EasyOCR reader
reader = _load_ocr_reader()

st.title("📜 AI-powered Historical Document Deciphering App")

st.sidebar.header("Upload Document")
uploaded_file = st.sidebar.file_uploader("Upload Image or PDF", type=["jpg", "jpeg", "png", "pdf"])


# Function to convert PDF to image
def pdf_to_images(pdf_bytes):
    """Render every page of a PDF to a PIL image.

    Args:
        pdf_bytes: Raw bytes of the PDF file.

    Returns:
        A list with one ``PIL.Image.Image`` per page, in page order.
    """
    images = []
    # The context manager closes the document once all pages are rendered.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        for page in doc:
            pix = page.get_pixmap()
            # The encoded bytes are copied into a BytesIO, so the image does
            # not depend on the document staying open.
            images.append(Image.open(io.BytesIO(pix.tobytes())))
    return images


# Function to enhance image
def enhance_image(image):
    """Prepare a page image for OCR.

    The image is converted to grayscale, denoised, sharpened, Otsu-thresholded
    and finally upscaled to 150% of its size.

    Args:
        image: A PIL image.

    Returns:
        A 2-D NumPy array holding the processed single-channel image.
    """
    img = np.array(image.convert('RGB'))
    # PIL delivers RGB channel order, so the RGB conversion code is required;
    # the BGR code would apply the red and blue luminance weights swapped.
    gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)

    # Denoise
    denoised = cv2.fastNlMeansDenoising(gray, h=30)

    # Sharpening
    kernel = np.array([[0, -1, 0],
                       [-1, 5, -1],
                       [0, -1, 0]])
    sharpened = cv2.filter2D(denoised, -1, kernel)

    # Thresholding (binarization)
    _, binary = cv2.threshold(sharpened, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # Resize
    scale_percent = 150
    width = int(binary.shape[1] * scale_percent / 100)
    height = int(binary.shape[0] * scale_percent / 100)
    return cv2.resize(binary, (width, height), interpolation=cv2.INTER_CUBIC)


# OCR
def perform_ocr(image):
    """Run EasyOCR on an image and return the recognized text.

    Args:
        image: Either a NumPy array (used as is) or a PIL image (converted to
            an RGB array first).

    Returns:
        The recognized text fragments joined with newlines.
    """
    if isinstance(image, np.ndarray):
        img_array = image
    else:
        img_array = np.array(image.convert('RGB'))
    # detail=0 makes readtext return plain strings only.
    return '\n'.join(reader.readtext(img_array, detail=0))


# Extract named entities
def extract_entities(text):
    """Group the named entities found in ``text`` by entity label.

    Args:
        text: Text to analyse.

    Returns:
        A dict mapping each entity group label to the set of matched words.
        Blank input yields an empty dict without invoking the model.
    """
    if not text or not text.strip():
        return {}
    extracted = {}
    for ent in ner_pipeline(text):
        extracted.setdefault(ent['entity_group'], set()).add(ent['word'])
    return extracted


# Clean extracted entities for Wikipedia
def clean_entity(text):
    """Remove punctuation from ``text`` and trim surrounding whitespace."""
    return _NON_WORD_CHARS.sub("", text).strip()


# Historical context fetcher
def get_historical_context(entities):
    """Look up a short Wikipedia summary for every extracted entity.

    Args:
        entities: Mapping of entity label to a collection of entity strings,
            as returned by ``extract_entities``.

    Returns:
        A dict mapping each original entity string to at most 500 characters
        of its Wikipedia summary, or to a message saying that nothing was
        found or that the lookup failed.
    """
    context = {}
    for values in entities.values():
        for item in values:
            cleaned_item = clean_entity(item)
            if not cleaned_item:
                # Nothing but punctuation: there is no title to look up.
                context[item] = f"No historical info found for '{item}'."
                continue
            try:
                page = wiki_wiki.page(cleaned_item)
                if page.exists():
                    context[item] = page.summary[:500]  # Limit summary
                else:
                    context[item] = f"No historical info found for '{item}'."
            except Exception as e:
                # Best effort: one failed lookup must not abort the others.
                logger.warning("Wikipedia lookup failed for '%s': %s", item, e)
                context[item] = f"Error fetching data for '{item}': {e}"
    return context


# Suggest corrections
def suggest_corrections(original_text):
    """Suggest replacements for words that nearly match the reference vocabulary.

    Surrounding punctuation is ignored for the comparison. Words of four
    characters or fewer, purely numeric words, and words that already equal a
    vocabulary entry are skipped.

    Args:
        original_text: Text whose whitespace-separated words are checked.

    Returns:
        A dict mapping each suspicious word (as it appears in the text) to the
        best-matching vocabulary word.
    """
    suggestions = {}
    for word in original_text.split():
        core = _EDGE_PUNCTUATION.sub("", word).lower()
        if len(core) <= 4 or core.isnumeric():
            continue
        if core in _REFERENCE_VOCABULARY:
            # Already spelled correctly; suggesting the same word is noise.
            continue
        best_match = max(
            _REFERENCE_VOCABULARY,
            key=lambda candidate: SequenceMatcher(None, core, candidate).ratio(),
        )
        if SequenceMatcher(None, core, best_match).ratio() > _MATCH_THRESHOLD:
            suggestions[word] = best_match
    return suggestions


# Generate map
def generate_map(entities):
    """Build a world map with one marker per ``LOC`` entity.

    Args:
        entities: Mapping of entity label to a collection of entity strings.

    Returns:
        A ``folium.Map`` centred on ``[20, 0]`` at zoom level 2.
    """
    m = folium.Map(location=[20, 0], zoom_start=2)
    for location in entities.get("LOC", ()):
        # NOTE(review): every marker uses the same dummy coordinates, so all
        # locations are drawn on top of each other. Real placement needs a
        # geocoding step, which this file does not have.
        folium.Marker(
            location=[51.5074, -0.1278],
            popup=f"Location: {location}",
            tooltip=location
        ).add_to(m)
    return m


# Main process
if uploaded_file:
    file_type = uploaded_file.type
    if file_type == "application/pdf":
        images = pdf_to_images(uploaded_file.read())
    else:
        images = [Image.open(uploaded_file)]

    # Widgets are created once per page, so each needs a key that is unique
    # per page; tying it to the upload as well keeps a new upload from
    # colliding with widgets of the previous one.
    doc_key = f"{uploaded_file.name}-{uploaded_file.size}"

    for page_index, image in enumerate(images):
        st.image(image, caption="Uploaded Document", use_container_width=True)

        enhanced = enhance_image(image)
        # A 2-D array is a single-channel image; no channels argument is
        # passed because only "RGB" and "BGR" are documented values.
        st.image(enhanced, caption="Enhanced Image", use_container_width=True)

        ocr_text = perform_ocr(enhanced)
        st.subheader("Extracted Text (OCR)")
        st.text_area("Text", ocr_text, height=200, key=f"ocr_text-{doc_key}-{page_index}")

        corrections = suggest_corrections(ocr_text)
        if corrections:
            st.subheader("AI Suggestions for Possible Corrections")
            for original, suggestion in corrections.items():
                st.markdown(f"**{original}** ➔ *{suggestion}*")

        if len(ocr_text.strip()) > 50:
            # truncation=True clips input that exceeds the model's maximum
            # length instead of failing on long pages.
            summary = summarizer(
                ocr_text,
                max_length=60,
                min_length=20,
                do_sample=False,
                truncation=True,
            )[0]['summary_text']
            st.subheader("Summary")
            st.write(summary)

        entities = extract_entities(ocr_text)
        st.subheader("Key Information")
        for label, items in entities.items():
            # Sets have no stable order; sort so reruns render identically.
            st.markdown(f"**{label}**: {', '.join(sorted(items))}")

        context = get_historical_context(entities)
        if context:
            st.subheader("Historical Context & Insights")
            for item, info in context.items():
                st.markdown(f"- **{item}**: {info}")

        st.subheader("Locations Mentioned")
        map_ = generate_map(entities)
        st_folium(map_, width=700, key=f"map-{doc_key}-{page_index}")

        st.markdown("---")
else:
    st.info("Upload an image or PDF of a historical document to begin.")

st.sidebar.markdown("---")
st.sidebar.markdown("Developed by **Cherilyn Marie Deocampo**")