Spaces:

chiichann
/

midterm_project_historical_document_ai

Build error

App Files Files Community

midterm_project_historical_document_ai / app.py

chiichann

Upload 2 files

96b6d93 verified 9 months ago

raw

history blame contribute delete

6.12 kB

	import streamlit as st
	from PIL import Image
	import pytesseract
	import io
	import fitz # PyMuPDF
	import cv2
	import numpy as np
	import requests
	from transformers import pipeline
	from difflib import SequenceMatcher
	import folium
	from streamlit_folium import st_folium
	import wikipedia

	# Streamlit App
	# The page configuration is issued before anything else touches Streamlit so
	# that it stays the first Streamlit command of the script run.
	st.set_page_config(page_title="AI Historical Document Decipher", layout="wide")


	@st.cache_resource(show_spinner=False)
	def _load_pipelines():
	    """Build the summarization and NER pipelines and return them as a pair.

	    The result is cached with ``st.cache_resource`` so the two models are
	    instantiated once and reused, instead of being rebuilt each time this
	    script is executed.
	    """
	    summarization = pipeline("summarization", model="facebook/bart-large-cnn")
	    recognition = pipeline("ner", aggregation_strategy="simple")
	    return summarization, recognition


	# Load summarization and NER pipeline (module-level names used further down)
	summarizer, ner_pipeline = _load_pipelines()

	st.title("📜 AI-powered Historical Document Deciphering App")

	st.sidebar.header("Upload Document")
	uploaded_file = st.sidebar.file_uploader("Upload Image or PDF", type=["jpg", "jpeg", "png", "pdf"])

	# Function to convert PDF to image
	def pdf_to_images(pdf_bytes):
	    """Render each page of an in-memory PDF and return the pages as PIL images.

	    Args:
	        pdf_bytes: Raw bytes of the PDF file.

	    Returns:
	        A list with one ``PIL.Image`` per page, in page order.
	    """
	    # Open the document in a ``with`` block so it is closed once every page
	    # has been rendered; the images are backed by their own byte buffers and
	    # stay usable afterwards.
	    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
	        return [
	            Image.open(io.BytesIO(page.get_pixmap().tobytes()))
	            for page in doc
	        ]

	# Function to enhance image
	def enhance_image(image):
	    """Clean up a document image for OCR.

	    The image is converted to grayscale, denoised, sharpened, binarized with
	    Otsu thresholding and finally upscaled to 150 % of its size.

	    Args:
	        image: A PIL image (any mode; it is converted to RGB first).

	    Returns:
	        The processed single-channel image as a NumPy array.
	    """
	    # PIL yields channels in RGB order, so the RGB->gray code is the matching
	    # one; the BGR code would apply the red and blue weights the wrong way round.
	    rgb = np.array(image.convert('RGB'))
	    gray = cv2.cvtColor(rgb, cv2.COLOR_RGB2GRAY)

	    # Denoise
	    denoised = cv2.fastNlMeansDenoising(gray, h=30)

	    # Sharpening
	    kernel = np.array([[0, -1, 0],
	                       [-1, 5, -1],
	                       [0, -1, 0]])
	    sharpened = cv2.filter2D(denoised, -1, kernel)

	    # Thresholding (binarization); the threshold value itself is picked by Otsu
	    _, binary = cv2.threshold(sharpened, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

	    # Optional: Resize (sometimes helps OCR)
	    scale_percent = 150  # percent of original size
	    width = int(binary.shape[1] * scale_percent / 100)
	    height = int(binary.shape[0] * scale_percent / 100)
	    return cv2.resize(binary, (width, height), interpolation=cv2.INTER_CUBIC)

	# Function to perform OCR
	def perform_ocr(image):
	    """Run Tesseract over *image* and return the recognised text.

	    Tesseract is invoked with ``--oem 3 --psm 6`` and
	    ``preserve_interword_spaces=1``.
	    """
	    return pytesseract.image_to_string(
	        image,
	        config=r'--oem 3 --psm 6 -c preserve_interword_spaces=1',
	    )

	# Function to extract named entities
	def extract_entities(text):
	    """Group the named entities found in *text* by their entity label.

	    Returns:
	        A dict mapping each ``entity_group`` label reported by the NER
	        pipeline to the set of ``word`` values carrying that label.
	    """
	    grouped = {}
	    for hit in ner_pipeline(text):
	        group = hit['entity_group']
	        if group not in grouped:
	            grouped[group] = set()
	        grouped[group].add(hit['word'])
	    return grouped

	def get_historical_context(entities):
	    """Fetch a two-sentence Wikipedia summary for every extracted entity.

	    Args:
	        entities: Mapping of entity label to an iterable of entity names.

	    Returns:
	        A dict keyed by entity name. The value is the summary, or a
	        human-readable message when the lookup was ambiguous, found no page
	        or failed for any other reason.
	    """
	    lookups = {}
	    # Only the names matter here; the labels are not used for the lookup.
	    for names in entities.values():
	        for name in names:
	            try:
	                blurb = wikipedia.summary(name, sentences=2)
	            except wikipedia.exceptions.DisambiguationError as ambiguous:
	                blurb = f"Multiple entries found for '{name}': {ambiguous.options[:3]}"
	            except wikipedia.exceptions.PageError:
	                blurb = f"No historical info found for '{name}'."
	            except Exception as failure:
	                blurb = f"Error retrieving info: {failure}"
	            lookups[name] = blurb
	    return lookups

	# Function to correct OCR errors (suggestions)
	def suggest_corrections(original_text):
	    """Suggest replacements for words that look like misread vocabulary terms.

	    Each whitespace-separated word longer than four characters (and not purely
	    numeric) is compared, case-insensitively, against a small fixed vocabulary.
	    The first vocabulary term whose similarity ratio exceeds 0.75 is suggested.

	    Args:
	        original_text: The OCR output to inspect.

	    Returns:
	        A dict mapping each suspicious word (as it appears in the text) to the
	        suggested vocabulary term. Words that already equal a vocabulary term
	        are not reported.
	    """
	    vocabulary = ("document", "historical", "archive", "event", "location")
	    suggestions = {}
	    for word in original_text.split():
	        if len(word) <= 4 or word.isnumeric():
	            continue
	        lowered = word.lower()
	        # An exact hit has ratio 1.0 and would otherwise be "corrected" to
	        # itself, which is just noise in the suggestion list.
	        if lowered in vocabulary:
	            continue
	        best = next(
	            (term for term in vocabulary
	             if SequenceMatcher(None, lowered, term).ratio() > 0.75),
	            None,
	        )
	        if best is not None:
	            suggestions[word] = best
	    return suggestions

	# Function to generate map
	def generate_map(entities):
	    """Build a world map with one marker per entity listed under ``"LOC"``.

	    Every marker is placed at the same hard-coded example coordinates; only
	    the popup and tooltip text vary with the location name.
	    """
	    world = folium.Map(location=[20, 0], zoom_start=2)
	    for place in entities.get("LOC", ()):
	        # Dummy coordinates for demonstration
	        pin = folium.Marker(
	            location=[51.5074, -0.1278],  # Example: London
	            popup=f"Location: {place}",
	            tooltip=place,
	        )
	        pin.add_to(world)
	    return world

	if uploaded_file:
	    file_type = uploaded_file.type

	    # Display and process the uploaded document
	    if file_type == "application/pdf":
	        images = pdf_to_images(uploaded_file.read())
	    else:
	        images = [Image.open(uploaded_file)]

	    # The page index feeds the widget keys below: a multi-page PDF renders the
	    # same widgets once per page, and they need distinct identities.
	    for page_number, image in enumerate(images):
	        st.image(image, caption="Uploaded Document", use_container_width=True)

	        # Enhance image
	        enhanced = enhance_image(image)
	        st.image(enhanced, caption="Enhanced Image", use_container_width=True, channels="GRAY")

	        # Perform OCR
	        ocr_text = perform_ocr(enhanced)
	        st.subheader("Extracted Text (OCR)")
	        # The key also depends on the text so that a newly uploaded document
	        # gets a fresh widget rather than one holding the previous text.
	        st.text_area(
	            "Text",
	            ocr_text,
	            height=200,
	            key=f"ocr_text_{page_number}_{hash(ocr_text)}",
	        )

	        # Suggest corrections
	        corrections = suggest_corrections(ocr_text)
	        if corrections:
	            st.subheader("AI Suggestions for Possible Corrections")
	            for original, suggestion in corrections.items():
	                st.markdown(f"{original} ➔ {suggestion}")

	        # Summarize text
	        if len(ocr_text.strip()) > 50:
	            # truncation=True keeps long OCR output within the model's input limit
	            summary = summarizer(
	                ocr_text,
	                max_length=60,
	                min_length=20,
	                do_sample=False,
	                truncation=True,
	            )[0]['summary_text']
	            st.subheader("Summary")
	            st.write(summary)

	        # Extract entities
	        entities = extract_entities(ocr_text)
	        st.subheader("Key Information")
	        for label, items in entities.items():
	            # The items are a set; sort them so the listing is stable across reruns
	            st.markdown(f"{label}: {', '.join(sorted(items))}")

	        # Provide historical context
	        context = get_historical_context(entities)
	        if context:
	            st.subheader("Historical Context & Insights")
	            for item, info in context.items():
	                st.markdown(f"- {item}: {info}")

	        # Visualize map
	        st.subheader("Locations Mentioned")
	        map_ = generate_map(entities)
	        st_folium(map_, width=700, key=f"map_{page_number}")

	        st.markdown("---")

	else:
	    st.info("Upload an image or PDF of a historical document to begin.")

	st.sidebar.markdown("---")
	st.sidebar.markdown("Developed by Cherilyn Marie Deocampo")