# Source: Hugging Face Space by JEPHONETORRE — revision "try 2" (commit ed63d2c)
import os
import streamlit as st
import requests
from PyPDF2 import PdfReader
from PIL import Image
import re
from collections import Counter
from streamlit_option_menu import option_menu
import folium
from streamlit_folium import st_folium
from geopy.geocoders import Nominatim
# --- Gemini API configuration ---
# The Gemini key is stored in the HF_API_KEY environment variable (e.g. as a
# Hugging Face Space secret); the code reads that name, so the error message
# must name it too (the original told users to set GEMINI_API_KEY instead).
gemini_api_key = os.getenv("HF_API_KEY")
if gemini_api_key is None:
    # Surface the problem in the UI; API calls below will fail without a key.
    st.error("API key not found. Please set the HF_API_KEY environment variable.")

# Endpoint for Gemini text generation; the key is passed as a query parameter.
# BUG FIX: the original f-string interpolated `gemini==_api_key` — a comparison
# of two undefined names (NameError at runtime) — instead of the key variable.
url = (
    "https://generativelanguage.googleapis.com/v1beta/models/"
    f"gemini-2.0-flash:generateContent?key={gemini_api_key}"
)

# Gemini expects a JSON request body.
headers = {
    'Content-Type': 'application/json'
}
# Function to call the Gemini API
def call_gemini_api(prompt):
    """Send *prompt* to the Gemini generateContent endpoint.

    Returns the generated text on success, or a human-readable error string
    on HTTP failure, network failure, or an unexpected response shape.
    """
    data = {
        "contents": [
            {
                "parts": [
                    {"text": prompt}
                ]
            }
        ]
    }
    try:
        # timeout guards against the request hanging indefinitely.
        response = requests.post(url, json=data, headers=headers, timeout=60)
        if response.status_code == 200:
            response_data = response.json()
            # BUG FIX: Gemini nests the generated text under
            # candidates[0].content.parts[0].text. The original looked for a
            # non-existent top-level 'generatedContent' key, so it always
            # returned "No generated content found."
            try:
                return response_data["candidates"][0]["content"]["parts"][0]["text"]
            except (KeyError, IndexError, TypeError):
                return "No generated content found."
        else:
            return f"Error: {response.status_code}, {response.text}"
    except requests.exceptions.RequestException as e:
        return f"An error occurred: {e}"
# OCR and Analysis Functions
def extract_text_from_pdf(file):
    """Return the concatenated text of every page of a PDF *file*.

    Pages whose extraction yields no text (e.g. scanned images with no text
    layer) are skipped rather than contributing empty lines.
    """
    pdf_reader = PdfReader(file)
    # Call extract_text() only once per page — the original extracted each
    # page's text a second time just to test its truthiness.
    page_texts = (page.extract_text() for page in pdf_reader.pages)
    return "\n".join(text for text in page_texts if text)
def extract_text_from_image(image):
    """Run OCR on a PIL *image* and return the recognized text."""
    # Imported lazily so the rest of the app loads even if pytesseract
    # (and the tesseract binary it wraps) is not installed.
    from pytesseract import image_to_string

    ocr_text = image_to_string(image)
    return ocr_text
def extract_keywords(text, num_keywords=10):
    """Return up to *num_keywords* frequent words (4+ letters) from *text*.

    Matching is case-insensitive, a small set of English stop words is
    excluded, and results are ordered from most to least frequent.
    """
    stop_words = frozenset(
        "the and for with from this that have will are was were been has".split()
    )
    counts = Counter(
        token
        for token in re.findall(r'\b\w{4,}\b', text.lower())
        if token not in stop_words
    )
    return [keyword for keyword, _ in counts.most_common(num_keywords)]
def contextualize_document(text):
    """Ask the Gemini API for historical context on the start of *text*.

    Only the first 1000 characters are sent, to keep the prompt small.
    """
    excerpt = text[:1000]
    prompt = "Provide historical context for the following text:\n\n" + excerpt
    return call_gemini_api(prompt)
def extract_locations(text):
    """Placeholder location extractor: always returns fixed place names.

    Replace with real NLP-based named-entity recognition; the *text*
    argument is currently ignored.
    """
    placeholder_locations = [
        "Manila, Philippines",
        "Cebu City, Philippines",
    ]
    return placeholder_locations
def geocode_locations(locations):
    """Resolve each place name in *locations* to a (name, lat, lon) tuple.

    Names that cannot be resolved are omitted; any lookup that raises shows
    a Streamlit warning and is skipped, so one bad name never aborts the rest.
    """
    geolocator = Nominatim(user_agent="geoapi")
    resolved = []
    for place_name in locations:
        try:
            match = geolocator.geocode(place_name)
            if match:
                resolved.append((place_name, match.latitude, match.longitude))
        except Exception as e:
            st.warning(f"Could not geocode location: {place_name}. Error: {e}")
    return resolved
# --- Streamlit UI Setup ---
# NOTE(review): the source paste had all indentation stripped; the block
# structure below (expander body, the `if uploaded_file:` guard, the tab
# `elif` chain) is reconstructed from the control-flow keywords. Every tab
# uses document_text, so all tab rendering sits under the upload guard.
# Mojibake emoji in the UI strings (e.g. "πŸ“œ") are restored to the intended
# characters (📜, 🔑, 🤝, ...).
st.set_page_config(page_title="AI-Powered Historical Document Analysis", layout="wide", page_icon=":scroll:")
st.title("📜 AI-Powered Historical Document Deciphering and Contextualization")

with st.expander("📖 **What is this app about?**"):
    st.write("""
The **AI-Powered Historical Document Deciphering and Contextualization** app leverages advanced AI to assist
historians and researchers in analyzing historical documents. It can process handwritten manuscripts, old prints, and maps
to extract key information, provide contextual insights, and visualize data on modern maps.
""")

# Compact horizontal navigation bar.
selected_tab = option_menu(
    menu_title="",
    options=["Home", "Key Points", "General Contents", "Historical Context", "Geospatial Visualization", "Human-AI Collaboration", "Knowledge Graphs"],
    icons=["house", "key", "book", "clock", "globe", "handshake", "share-alt"],
    menu_icon="cast",
    default_index=0,
    orientation="horizontal",
)

# Upload Section
uploaded_file = st.file_uploader("Upload an image or PDF of the historical document", type=["pdf", "png", "jpg", "jpeg"])

if uploaded_file:
    file_name = uploaded_file.name  # Name decides the extraction path below.
    st.subheader(f"Uploaded File: {file_name}")

    # PDFs go through PyPDF2 text extraction; everything else is OCR'd.
    if file_name.endswith(".pdf"):
        document_text = extract_text_from_pdf(uploaded_file)
    else:  # Image files
        image = Image.open(uploaded_file)
        document_text = extract_text_from_image(image)

    # Persist the extracted text so reruns can reuse it without re-parsing.
    st.session_state["document_text"] = document_text
    st.success("Document uploaded and processed successfully!")

    if selected_tab == "Home":
        st.header("🗎 Document Overview")
        st.write("The uploaded document has been processed. Navigate to the other tabs for detailed analysis.")
    elif selected_tab == "Key Points":
        st.header("🔑 Key Information")
        keywords = extract_keywords(document_text)
        st.write(", ".join(keywords))
    elif selected_tab == "General Contents":
        st.header("📜 General Contents")
        st.text_area("Document Text", value=document_text, height=300, disabled=True)
    elif selected_tab == "Historical Context":
        st.header("🕰 Historical Context")
        with st.spinner("Generating historical context..."):
            context = contextualize_document(document_text)
        st.markdown(context)
    elif selected_tab == "Geospatial Visualization":
        st.header("🌍 Geospatial Data Integration and Visualization")
        with st.spinner("Extracting locations and preparing map..."):
            locations = extract_locations(document_text)
            geocoded_locations = geocode_locations(locations)
        if geocoded_locations:
            # Default map center: Cebu, Philippines.
            m = folium.Map(location=[10.3157, 123.8854], zoom_start=6)
            for loc, lat, lon in geocoded_locations:
                folium.Marker([lat, lon], popup=loc).add_to(m)
            st_folium(m, width=700, height=500)
        else:
            st.warning("No geocoded locations available. Ensure the document contains valid location data.")
    elif selected_tab == "Human-AI Collaboration":
        st.header("🤝 Human-AI Collaboration")
        corrected_text = st.text_area("Edit the extracted text below if there are OCR errors:", value=document_text, height=300)
        if st.button("Generate Historical Insights"):
            with st.spinner("Analyzing text for insights..."):
                insights = contextualize_document(corrected_text)
            st.markdown(insights)
        if st.button("Generate Alternative Readings"):
            with st.spinner("Generating alternative readings..."):
                alternative_readings = contextualize_document(corrected_text + "\n\nProvide alternative readings:")
            st.markdown(alternative_readings)
        st.write("### Related Historical Documents")
        st.markdown("""
- [Historical Archive 1](https://www.example.com/archive1)
- [Historical Archive 2](https://www.example.com/archive2)
""")
    elif selected_tab == "Knowledge Graphs":
        st.header("📊 Historical Context Linkage via Knowledge Graphs")
        with st.spinner("Generating knowledge graph..."):
            graph_data = contextualize_document(document_text)
        st.text_area("Knowledge Graph Data", value=graph_data, height=300, disabled=True)