|
|
import os |
|
|
import streamlit as st |
|
|
import requests |
|
|
from PyPDF2 import PdfReader |
|
|
from PIL import Image |
|
|
import re |
|
|
from collections import Counter |
|
|
from streamlit_option_menu import option_menu |
|
|
import folium |
|
|
from streamlit_folium import st_folium |
|
|
from geopy.geocoders import Nominatim |
|
|
|
|
|
|
|
|
# Gemini API configuration: the key must be supplied via the environment.
# Fixes: the original read the wrong env var ("HF_API_KEY" vs the documented
# GEMINI_API_KEY) and had a syntax error in the URL f-string ({gemini==_api_key}).
gemini_api_key = os.getenv("GEMINI_API_KEY")

if gemini_api_key is None:
    st.error("API key not found. Please set the GEMINI_API_KEY environment variable.")
else:
    # Gemini 2.0 Flash generateContent endpoint; the key is passed as a query parameter.
    url = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?key={gemini_api_key}"

# JSON request headers shared by all Gemini calls.
headers = {
    'Content-Type': 'application/json'
}
|
|
|
|
|
|
|
|
def call_gemini_api(prompt):
    """Send a text prompt to the Gemini generateContent endpoint.

    Args:
        prompt: Plain-text prompt forwarded to the model.

    Returns:
        The generated text on success, otherwise a human-readable error
        string. This function never raises; callers render the return
        value directly in the UI.
    """
    data = {
        "contents": [
            {
                "parts": [
                    {"text": prompt}
                ]
            }
        ]
    }

    try:
        # Timeout keeps a stalled request from hanging the Streamlit script run.
        response = requests.post(url, json=data, headers=headers, timeout=60)

        if response.status_code == 200:
            response_data = response.json()
            # The Gemini REST response nests generated text under
            # candidates[0].content.parts[0].text — there is no top-level
            # "generatedContent" field (the original always fell through
            # to the "No generated content found." branch).
            try:
                return response_data["candidates"][0]["content"]["parts"][0]["text"]
            except (KeyError, IndexError, TypeError):
                return "No generated content found."
        else:
            return f"Error: {response.status_code}, {response.text}"

    except requests.exceptions.RequestException as e:
        return f"An error occurred: {e}"
|
|
|
|
|
|
|
|
def extract_text_from_pdf(file):
    """Concatenate the text of every non-empty page in a PDF.

    Args:
        file: A file-like object (e.g. Streamlit's UploadedFile) that
            PyPDF2's PdfReader can parse.

    Returns:
        Page texts joined with newlines; pages yielding no text are skipped.
    """
    pdf_reader = PdfReader(file)
    # Extract each page exactly once — the original called extract_text()
    # twice per page (once in the filter, once for the joined value).
    page_texts = (page.extract_text() for page in pdf_reader.pages)
    return "\n".join(text for text in page_texts if text)
|
|
|
|
|
def extract_text_from_image(image):
    """Run OCR on a PIL image and return the recognized text."""
    # Imported lazily so the app still loads when pytesseract is absent.
    import pytesseract
    return pytesseract.image_to_string(image)
|
|
|
|
|
def extract_keywords(text, num_keywords=10):
    """Return up to num_keywords most frequent words of 4+ characters.

    Text is lower-cased, tokenized on word boundaries, and a small set of
    English stopwords is discarded before counting.
    """
    stopwords = set("the and for with from this that have will are was were been has".split())
    tokens = re.findall(r'\b\w{4,}\b', text.lower())
    counts = Counter(token for token in tokens if token not in stopwords)
    return [word for word, _ in counts.most_common(num_keywords)]
|
|
|
|
|
def contextualize_document(text):
    """Generate historical context based on document text."""
    # Only the first 1000 characters are sent to keep the prompt compact.
    prompt = f"Provide historical context for the following text:\n\n{text[:1000]}"
    return call_gemini_api(prompt)
|
|
|
|
|
def extract_locations(text):
    """Placeholder location extractor — replace with NLP-based extraction.

    Ignores its input and returns a fixed sample of Philippine locations
    so the geospatial tab can be demonstrated end to end.
    """
    return ["Manila, Philippines", "Cebu City, Philippines"]
|
|
|
|
|
def geocode_locations(locations):
    """Resolve location names to coordinates via the Nominatim service.

    Args:
        locations: Iterable of place-name strings.

    Returns:
        List of (name, latitude, longitude) tuples for every name that
        geocoded successfully; failures are surfaced as UI warnings and
        skipped rather than aborting the whole batch.
    """
    geolocator = Nominatim(user_agent="geoapi")
    results = []
    for location in locations:
        try:
            hit = geolocator.geocode(location)
            if hit:
                results.append((location, hit.latitude, hit.longitude))
        except Exception as e:
            # Best-effort: report the failure and continue with the rest.
            st.warning(f"Could not geocode location: {location}. Error: {e}")
    return results
|
|
|
|
|
|
|
|
# ---- Page chrome and navigation ---------------------------------------------
st.set_page_config(page_title="AI-Powered Historical Document Analysis", layout="wide", page_icon=":scroll:")

# NOTE(review): the leading "π" glyphs in titles/headers throughout this file
# look like mojibake of emoji characters — confirm intended icons/encoding.
st.title("π AI-Powered Historical Document Deciphering and Contextualization")

# Collapsible "about" section shown above the navigation bar.
with st.expander("π **What is this app about?**"):
    st.write("""
    The **AI-Powered Historical Document Deciphering and Contextualization** app leverages advanced AI to assist
    historians and researchers in analyzing historical documents. It can process handwritten manuscripts, old prints, and maps
    to extract key information, provide contextual insights, and visualize data on modern maps.
    """)

# Horizontal tab bar; `selected_tab` drives the dispatch further down the file.
selected_tab = option_menu(
    menu_title="",
    options=["Home", "Key Points", "General Contents", "Historical Context", "Geospatial Visualization", "Human-AI Collaboration", "Knowledge Graphs"],
    icons=["house", "key", "book", "clock", "globe", "handshake", "share-alt"],
    menu_icon="cast",
    default_index=0,
    orientation="horizontal",
)
|
|
|
|
|
|
|
|
# ---- Document upload and per-tab analysis -----------------------------------
uploaded_file = st.file_uploader("Upload an image or PDF of the historical document", type=["pdf", "png", "jpg", "jpeg"])

if uploaded_file:
    file_name = uploaded_file.name
    st.subheader(f"Uploaded File: {file_name}")

    # Route by extension: PDFs go through PyPDF2 text extraction,
    # everything else is treated as an image and OCR'd.
    if file_name.endswith(".pdf"):
        document_text = extract_text_from_pdf(uploaded_file)
    else:
        image = Image.open(uploaded_file)
        document_text = extract_text_from_image(image)

    # NOTE(review): session_state is written here but never read back —
    # the tabs below use the local `document_text` directly, so the text is
    # lost when no file is present on a rerun. Confirm whether the tabs
    # should fall back to st.session_state["document_text"].
    st.session_state["document_text"] = document_text
    st.success("Document uploaded and processed successfully!")

    # Tab dispatch — all tabs depend on `document_text`, so the whole chain
    # lives inside the upload guard above.
    if selected_tab == "Home":
        st.header("π Document Overview")
        st.write("The uploaded document has been processed. Navigate to the other tabs for detailed analysis.")

    elif selected_tab == "Key Points":
        # Frequency-based keyword summary of the extracted text.
        st.header("π Key Information")
        keywords = extract_keywords(document_text)
        st.write(", ".join(keywords))

    elif selected_tab == "General Contents":
        # Read-only dump of the full extracted text.
        st.header("π General Contents")
        st.text_area("Document Text", value=document_text, height=300, disabled=True)

    elif selected_tab == "Historical Context":
        # LLM-generated context for (the first 1000 chars of) the document.
        st.header("π° Historical Context")
        with st.spinner("Generating historical context..."):
            context = contextualize_document(document_text)
            st.markdown(context)

    elif selected_tab == "Geospatial Visualization":
        st.header("π Geospatial Data Integration and Visualization")
        with st.spinner("Extracting locations and preparing map..."):
            locations = extract_locations(document_text)
            geocoded_locations = geocode_locations(locations)

        if geocoded_locations:
            # Map centered on the Philippines (matches the stubbed locations).
            m = folium.Map(location=[10.3157, 123.8854], zoom_start=6)
            for loc, lat, lon in geocoded_locations:
                folium.Marker([lat, lon], popup=loc).add_to(m)

            st_folium(m, width=700, height=500)
        else:
            st.warning("No geocoded locations available. Ensure the document contains valid location data.")

    elif selected_tab == "Human-AI Collaboration":
        # Let the user correct OCR errors before re-running the LLM analysis.
        st.header("π€ Human-AI Collaboration")
        corrected_text = st.text_area("Edit the extracted text below if there are OCR errors:", value=document_text, height=300)

        if st.button("Generate Historical Insights"):
            with st.spinner("Analyzing text for insights..."):
                insights = contextualize_document(corrected_text)
                st.markdown(insights)

        if st.button("Generate Alternative Readings"):
            with st.spinner("Generating alternative readings..."):
                alternative_readings = contextualize_document(corrected_text + "\n\nProvide alternative readings:")
                st.markdown(alternative_readings)

        # Static placeholder links to related archives.
        st.write("### Related Historical Documents")
        st.markdown("""
        - [Historical Archive 1](https://www.example.com/archive1)
        - [Historical Archive 2](https://www.example.com/archive2)
        """)

    elif selected_tab == "Knowledge Graphs":
        # NOTE(review): this reuses the generic contextualization prompt, so
        # the "graph" is just LLM prose — presumably a placeholder; confirm.
        st.header("π Historical Context Linkage via Knowledge Graphs")
        with st.spinner("Generating knowledge graph..."):
            graph_data = contextualize_document(document_text)
            st.text_area("Knowledge Graph Data", value=graph_data, height=300, disabled=True)
|
|