Spaces:

JEPHONETORRE
/

HistoricalDocumentDeciphering2

Sleeping

App Files Files Community

JEPHONETORRE committed on Apr 8, 2025

Commit

11bfd4a

1 Parent(s): 7f65832

1st

Browse files

Files changed (3) hide show

.env +1 -0
app.py +190 -0
requirements.txt +10 -0

.env ADDED Viewed

	@@ -0,0 +1 @@


1	+ AIzaSyBbzFCa84gRACICF9JrjGtonTl8UIdNOPs

app.py ADDED Viewed

	@@ -0,0 +1,190 @@

+import os
+import streamlit as st
+import requests
+from PyPDF2 import PdfReader
+from PIL import Image
+import re
+from collections import Counter
+from streamlit_option_menu import option_menu
+import folium
+from streamlit_folium import st_folium
+from geopy.geocoders import Nominatim
+# Fetch GEMINI API key from environment variables
+gemini_api_key = os.getenv("HF_API_KEY")  # Make sure the environment variable is set correctly
+if gemini_api_key is None:
+    st.error("API key not found. Please set the GEMINI_API_KEY environment variable.")
+else:
+    # Define the URL for Gemini API
+    url = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?key={gemini_api_key}"
+    # Define headers for the API request
+    headers = {
+        'Content-Type': 'application/json'
+    }
+    # Function to call the Gemini API
+    def call_gemini_api(prompt):
+        data = {
+            "contents": [
+                {
+                    "parts": [
+                        {"text": prompt}
+                    ]
+                }
+            ]
+        }
+        try:
+            response = requests.post(url, json=data, headers=headers)
+            # Check if the response is successful (HTTP status 200)
+            if response.status_code == 200:
+                response_data = response.json()
+                generated_content = response_data.get('generatedContent')
+                if generated_content:
+                    return generated_content
+                else:
+                    return "No generated content found."
+            else:
+                return f"Error: {response.status_code}, {response.text}"
+        except requests.exceptions.RequestException as e:
+            return f"An error occurred: {e}"
+    # OCR and Analysis Functions
+    def extract_text_from_pdf(file):
+        pdf_reader = PdfReader(file)
+        return "\n".join([page.extract_text() for page in pdf_reader.pages if page.extract_text()])
+    def extract_text_from_image(image):
+        from pytesseract import image_to_string  # Requires pytesseract library
+        return image_to_string(image)
+    def extract_keywords(text, num_keywords=10):
+        words = re.findall(r'\b\w{4,}\b', text.lower())  # Extract words with 4+ letters
+        common_words = set("the and for with from this that have will are was were been has".split())  # Stop words
+        filtered_words = [word for word in words if word not in common_words]
+        most_common = Counter(filtered_words).most_common(num_keywords)
+        return [word for word, _ in most_common]
+    def contextualize_document(text):
+        """Generate historical context based on document text."""
+        return call_gemini_api(f"Provide historical context for the following text:\n\n{text[:1000]}")
+    def extract_locations(text):
+        """Dummy function to extract location names from text. Replace with NLP-based extraction."""
+        # For example purposes, manually returning some locations
+        return ["Manila, Philippines", "Cebu City, Philippines"]
+    def geocode_locations(locations):
+        """Geocode location names to latitude and longitude using a geocoding service."""
+        geolocator = Nominatim(user_agent="geoapi")
+        geocoded_locations = []
+        for location in locations:
+            try:
+                geo_data = geolocator.geocode(location)
+                if geo_data:
+                    geocoded_locations.append((location, geo_data.latitude, geo_data.longitude))
+            except Exception as e:
+                st.warning(f"Could not geocode location: {location}. Error: {e}")
+        return geocoded_locations
+    # Streamlit UI Setup
+    st.set_page_config(page_title="AI-Powered Historical Document Analysis", layout="wide", page_icon=":scroll:")
+    st.title("📜 AI-Powered Historical Document Deciphering and Contextualization")
+    with st.expander("📖 **What is this app about?**"):
+        st.write("""
+        The **AI-Powered Historical Document Deciphering and Contextualization** app leverages advanced AI to assist
+        historians and researchers in analyzing historical documents. It can process handwritten manuscripts, old prints, and maps
+        to extract key information, provide contextual insights, and visualize data on modern maps.
+        """)
+    # Compact Navigation
+    selected_tab = option_menu(
+        menu_title="",
+        options=["Home", "Key Points", "General Contents", "Historical Context", "Geospatial Visualization", "Human-AI Collaboration", "Knowledge Graphs"],
+        icons=["house", "key", "book", "clock", "globe", "handshake", "share-alt"],
+        menu_icon="cast",
+        default_index=0,
+        orientation="horizontal",
+    )
+    # Upload Section
+    uploaded_file = st.file_uploader("Upload an image or PDF of the historical document", type=["pdf", "png", "jpg", "jpeg"])
+    if uploaded_file:
+        file_name = uploaded_file.name  # Get the name of the uploaded file
+        st.subheader(f"Uploaded File: {file_name}")
+        if file_name.endswith(".pdf"):
+            document_text = extract_text_from_pdf(uploaded_file)
+        else:  # Image files
+            image = Image.open(uploaded_file)
+            document_text = extract_text_from_image(image)
+        st.session_state["document_text"] = document_text
+        st.success("Document uploaded and processed successfully!")
+        if selected_tab == "Home":
+            st.header("🗎 Document Overview")
+            st.write("The uploaded document has been processed. Navigate to the other tabs for detailed analysis.")
+        elif selected_tab == "Key Points":
+            st.header("🔑 Key Information")
+            keywords = extract_keywords(document_text)
+            st.write(", ".join(keywords))
+        elif selected_tab == "General Contents":
+            st.header("📜 General Contents")
+            st.text_area("Document Text", value=document_text, height=300, disabled=True)
+        elif selected_tab == "Historical Context":
+            st.header("🕰 Historical Context")
+            with st.spinner("Generating historical context..."):
+                context = contextualize_document(document_text)
+                st.markdown(context)
+        elif selected_tab == "Geospatial Visualization":
+            st.header("🌍 Geospatial Data Integration and Visualization")
+            with st.spinner("Extracting locations and preparing map..."):
+                locations = extract_locations(document_text)
+                geocoded_locations = geocode_locations(locations)
+                if geocoded_locations:
+                    m = folium.Map(location=[10.3157, 123.8854], zoom_start=6)  # Default location: Cebu, Philippines
+                    for loc, lat, lon in geocoded_locations:
+                        folium.Marker([lat, lon], popup=loc).add_to(m)
+                    st_folium(m, width=700, height=500)
+                else:
+                    st.warning("No geocoded locations available. Ensure the document contains valid location data.")
+        elif selected_tab == "Human-AI Collaboration":
+            st.header("🤝 Human-AI Collaboration")
+            corrected_text = st.text_area("Edit the extracted text below if there are OCR errors:", value=document_text, height=300)
+            if st.button("Generate Historical Insights"):
+                with st.spinner("Analyzing text for insights..."):
+                    insights = contextualize_document(corrected_text)
+                    st.markdown(insights)
+            if st.button("Generate Alternative Readings"):
+                with st.spinner("Generating alternative readings..."):
+                    alternative_readings = contextualize_document(corrected_text + "\n\nProvide alternative readings:")
+                    st.markdown(alternative_readings)
+            st.write("### Related Historical Documents")
+            st.markdown("""
+            - [Historical Archive 1](https://www.example.com/archive1)
+            - [Historical Archive 2](https://www.example.com/archive2)
+            """)
+        elif selected_tab == "Knowledge Graphs":
+            st.header("📊 Historical Context Linkage via Knowledge Graphs")
+            with st.spinner("Generating knowledge graph..."):
+                graph_data = contextualize_document(document_text)
+                st.text_area("Knowledge Graph Data", value=graph_data, height=300, disabled=True)

requirements.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+streamlit
+PyPDF2
+pillow
+huggingface_hub
+streamlit-option-menu
+pytesseract
+folium
+streamlit-folium
+geopy
+requests