"""Streamlit app that deciphers scanned historical documents.

For every page of the uploaded image/PDF the script runs, in order:
image enhancement -> Tesseract OCR -> spelling hints -> summary ->
named-entity extraction -> Wikipedia context -> map of mentioned places.
"""

import io
from difflib import SequenceMatcher

import cv2
import fitz  # PyMuPDF
import folium
import numpy as np
import pytesseract
import requests  # noqa: F401  (kept: unused in this script but part of the original imports)
import streamlit as st
import wikipedia
from PIL import Image
from streamlit_folium import st_folium
from transformers import pipeline

# Must run before any other Streamlit command, including the cached model
# loader below (its spinner counts as a Streamlit element).
st.set_page_config(page_title="AI Historical Document Decipher", layout="wide")

# Vocabulary that OCR output is compared against when proposing corrections.
_REFERENCE_WORDS = ("document", "historical", "archive", "event", "location")
# Characters stripped from both ends of a token before it is compared.
_EDGE_PUNCTUATION = ".,;:!?\"'()[]{}"
# Minimum SequenceMatcher ratio for a reference word to count as a suggestion.
_SIMILARITY_THRESHOLD = 0.75
# Placeholder marker position (London); no geocoding is performed.
_PLACEHOLDER_COORDINATES = [51.5074, -0.1278]


@st.cache_resource
def _load_pipelines():
    """Build the summarization and NER pipelines once per server process.

    Streamlit re-executes the whole script on every interaction; without the
    cache both models would be re-instantiated on each rerun.
    """
    summarization = pipeline("summarization", model="facebook/bart-large-cnn")
    ner = pipeline("ner", aggregation_strategy="simple")
    return summarization, ner


# Load summarization and NER pipeline
summarizer, ner_pipeline = _load_pipelines()

# Streamlit App
st.title("📜 AI-powered Historical Document Deciphering App")
st.sidebar.header("Upload Document")
uploaded_file = st.sidebar.file_uploader(
    "Upload Image or PDF", type=["jpg", "jpeg", "png", "pdf"]
)


def pdf_to_images(pdf_bytes):
    """Render every page of a PDF to a PIL image.

    Args:
        pdf_bytes: Raw bytes of the PDF file.

    Returns:
        A list with one PIL image per page, in page order.
    """
    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    try:
        images = []
        for page in doc:
            pix = page.get_pixmap()
            images.append(Image.open(io.BytesIO(pix.tobytes())))
        return images
    finally:
        # Release the document handle even if rendering a page fails.
        doc.close()


def enhance_image(image, scale_percent=150):
    """Prepare a page image for OCR.

    Steps: grayscale -> non-local-means denoise -> sharpen -> Otsu
    binarization -> upscale.

    Args:
        image: PIL image of the page.
        scale_percent: Output size as a percentage of the input size.

    Returns:
        A 2-D NumPy array holding the processed single-channel image.
    """
    rgb = np.array(image.convert("RGB"))
    # PIL yields RGB channel order, so the RGB (not BGR) conversion code is
    # required for correct luminance weights.
    gray = cv2.cvtColor(rgb, cv2.COLOR_RGB2GRAY)

    # Denoise
    denoised = cv2.fastNlMeansDenoising(gray, h=30)

    # Sharpening
    kernel = np.array([[0, -1, 0], [-1, 5, -1], [0, -1, 0]])
    sharpened = cv2.filter2D(denoised, -1, kernel)

    # Thresholding (binarization)
    _, binary = cv2.threshold(
        sharpened, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
    )

    # Resize (sometimes helps OCR); never request a zero-sized output.
    width = max(1, int(binary.shape[1] * scale_percent / 100))
    height = max(1, int(binary.shape[0] * scale_percent / 100))
    return cv2.resize(binary, (width, height), interpolation=cv2.INTER_CUBIC)


def perform_ocr(image):
    """Run Tesseract on *image* and return the recognised text."""
    custom_oem_psm_config = r'--oem 3 --psm 6 -c preserve_interword_spaces=1'
    return pytesseract.image_to_string(image, config=custom_oem_psm_config)


def extract_entities(text):
    """Group the named entities found in *text* by entity label.

    Returns:
        A dict mapping each entity group label to the set of entity strings
        reported under it. Empty when *text* is blank.
    """
    if not text.strip():
        # Nothing to analyse; skip the model call entirely.
        return {}
    extracted = {}
    for ent in ner_pipeline(text):
        extracted.setdefault(ent['entity_group'], set()).add(ent['word'])
    return extracted


@st.cache_data(show_spinner=False)
def _wikipedia_blurb(item):
    """Return a two-sentence Wikipedia summary (or a not-found note) for *item*.

    Only outcomes determined by the lookup itself are returned (and therefore
    cached); any other exception propagates so it is not memoised.
    """
    try:
        return wikipedia.summary(item, sentences=2)
    except wikipedia.exceptions.DisambiguationError as e:
        return f"Multiple entries found for '{item}': {e.options[:3]}"
    except wikipedia.exceptions.PageError:
        return f"No historical info found for '{item}'."


def get_historical_context(entities):
    """Look up a short Wikipedia description for every extracted entity.

    Args:
        entities: Mapping of entity label to a collection of entity strings.

    Returns:
        A dict mapping each entity string to its summary, or to a message
        describing why no summary could be retrieved.
    """
    context = {}
    for values in entities.values():
        for item in values:
            try:
                context[item] = _wikipedia_blurb(item)
            except Exception as e:  # best-effort: one failed lookup must not stop the page
                context[item] = f"Error retrieving info: {e}"
    return context


def suggest_corrections(original_text):
    """Propose replacements for tokens that closely resemble a reference word.

    Tokens are compared case-insensitively with surrounding punctuation
    removed. Short tokens, numbers, and tokens that already equal a reference
    word are skipped, so a correctly spelled word is never "corrected".

    Returns:
        A dict mapping the original token to the most similar reference word.
    """
    suggestions = {}
    for word in original_text.split():
        core = word.strip(_EDGE_PUNCTUATION).lower()
        if len(core) <= 4 or core.isnumeric() or core in _REFERENCE_WORDS:
            continue
        best_ratio, best_match = max(
            (SequenceMatcher(None, core, ref).ratio(), ref)
            for ref in _REFERENCE_WORDS
        )
        if best_ratio > _SIMILARITY_THRESHOLD:
            suggestions[word] = best_match
    return suggestions


def generate_map(entities):
    """Build a world map with one marker per entity in the "LOC" group.

    NOTE: every marker is placed at the same placeholder coordinates; the
    location names only appear in the popup and tooltip.
    """
    m = folium.Map(location=[20, 0], zoom_start=2)
    for location in sorted(entities.get("LOC", ())):
        # Dummy coordinates for demonstration
        folium.Marker(
            location=_PLACEHOLDER_COORDINATES,  # Example: London
            popup=f"Location: {location}",
            tooltip=location,
        ).add_to(m)
    return m


if uploaded_file:
    file_type = uploaded_file.type

    # Display and process the uploaded document
    if file_type == "application/pdf":
        images = pdf_to_images(uploaded_file.read())
    else:
        images = [Image.open(uploaded_file)]

    for page_no, image in enumerate(images, start=1):
        st.image(image, caption="Uploaded Document", use_container_width=True)

        # Enhance image
        enhanced = enhance_image(image)
        st.image(enhanced, caption="Enhanced Image", use_container_width=True)

        # Perform OCR
        ocr_text = perform_ocr(enhanced)
        st.subheader("Extracted Text (OCR)")
        # Per-page key keeps widgets distinct when a PDF has several pages.
        st.text_area("Text", ocr_text, height=200, key=f"ocr_text_{page_no}")

        # Suggest corrections
        corrections = suggest_corrections(ocr_text)
        if corrections:
            st.subheader("AI Suggestions for Possible Corrections")
            for original, suggestion in corrections.items():
                st.markdown(f"**{original}** ➔ *{suggestion}*")

        # Summarize text
        if len(ocr_text.strip()) > 50:
            # truncation=True clips input that exceeds the model's maximum
            # sequence length instead of raising on long pages.
            summary = summarizer(
                ocr_text,
                max_length=60,
                min_length=20,
                do_sample=False,
                truncation=True,
            )[0]['summary_text']
            st.subheader("Summary")
            st.write(summary)

        # Extract entities
        entities = extract_entities(ocr_text)
        st.subheader("Key Information")
        for label, items in entities.items():
            # Sets have no stable order; sort for a reproducible display.
            st.markdown(f"**{label}**: {', '.join(sorted(items))}")

        # Provide historical context
        context = get_historical_context(entities)
        if context:
            st.subheader("Historical Context & Insights")
            for item, info in context.items():
                st.markdown(f"- **{item}**: {info}")

        # Visualize map
        st.subheader("Locations Mentioned")
        map_ = generate_map(entities)
        # The return value is unused, so request no returned objects: this
        # stops map pan/zoom from triggering a full script rerun.
        st_folium(map_, width=700, key=f"map_{page_no}", returned_objects=[])

        st.markdown("---")
else:
    st.info("Upload an image or PDF of a historical document to begin.")

st.sidebar.markdown("---")
st.sidebar.markdown("Developed by **Cherilyn Marie Deocampo**")