# NOTE(review): the "Spaces:" / "Build error" lines that appeared here are
# Hugging Face Spaces build-log artifacts captured with the source,
# not program text.
| import streamlit as st | |
| from PIL import Image | |
| from pytesseract import pytesseract | |
| import PyPDF2 | |
| import enum | |
| import os | |
| # Install Tesseract at runtime | |
| if not os.path.exists("/usr/bin/tesseract"): | |
| os.system("apt-get update && apt-get install -y tesseract-ocr libtesseract-dev") | |
| import re | |
| from collections import defaultdict | |
| import folium | |
| from streamlit_folium import st_folium | |
| from geopy.geocoders import Nominatim | |
| from geopy.exc import GeocoderTimedOut | |
| import wikipedia | |
| from transformers import pipeline | |
| from openai import OpenAI | |
# NVIDIA OpenAI API Setup
# SECURITY: the API key was previously hard-coded here. Never commit
# credentials to source control — read the key from the environment
# (set NVIDIA_API_KEY in the deployment's secrets) instead.
client = OpenAI(
    base_url="https://integrate.api.nvidia.com/v1",
    api_key=os.environ.get("NVIDIA_API_KEY", ""),
)
# Load Named Entity Recognition (NER) Model
# Module-level singleton: downloading/loading the BERT CoNLL-03 NER model is
# expensive, so it is done once at import time rather than per request.
nlp = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english")
# NOTE(review): set_page_config must run before any other st.* call in the
# script — confirm no st.* call precedes it once the file is reassembled.
st.set_page_config(page_title="OCR & Historical Analysis", page_icon="π", layout="wide")
| # Custom Styling | |
def style_text(text):
    """Render *text* inside a grey rounded box for st.markdown display.

    The result is injected with unsafe_allow_html=True, so the payload is
    HTML-escaped first: OCR output and model responses are untrusted and
    may contain markup that would otherwise break (or script-inject) the
    rendered page.
    """
    import html  # local import keeps this fix self-contained

    return f"""
    <div style='padding:10px;border-radius:10px;
                background-color:#e0e0e0;
                color:#333;
                font-weight:500;
                font-size:16px;'>
        {html.escape(text)}
    </div>
    """
def find_related_documents(query):
    """Return up to five Wikipedia page URLs related to *query*.

    Best-effort: any failure (network, disambiguation, missing page) is
    reported as a single-element error list instead of being raised, so
    the Streamlit page keeps rendering.
    """
    try:
        urls = []
        for title in wikipedia.search(query, results=5):
            urls.append(wikipedia.page(title).url)
        return urls
    except Exception as e:
        return [f"Error retrieving related documents: {str(e)}"]
def geocode_location(location):
    """Resolve a place name to a (lat, lon) tuple via Nominatim.

    Returns None when the lookup times out or the name cannot be resolved.
    """
    geolocator = Nominatim(user_agent="streamlit_app")
    try:
        match = geolocator.geocode(location, timeout=10)
    except GeocoderTimedOut:
        return None
    if match is None:
        return None
    return (match.latitude, match.longitude)
def generate_historical_context_nvidia(text):
    """Use NVIDIA OpenAI API to generate a structured, summarized historical context.

    Two-pass pipeline: (1) ask the model for a detailed historical analysis
    of *text*; (2) ask it to compress that analysis into at most five bullet
    points. Returns the cleaned summary string, or an error string on any
    API failure (best-effort: the Streamlit page should render regardless).
    """
    prompt_analysis = f"""
    Analyze the following text and provide a historical context. Identify:
    - Key historical events
    - Significant figures involved
    - The broader historical significance
    Text: {text}
    Provide a detailed response.
    """
    prompt_summary = """
    Summarize the historical context provided above in a concise and structured format:
    - Limit to 5 bullet points
    - Each bullet point should be under 100 words
    - Avoid unnecessary explanations or preamble — return only the summary
    """
    try:
        # Step 1: Generate Detailed Historical Context
        completion = client.chat.completions.create(
            model="deepseek-ai/deepseek-r1",
            messages=[
                {"role": "system", "content": "You are a historian providing detailed historical insights."},
                {"role": "user", "content": prompt_analysis}
            ],
            temperature=0.4,
            top_p=0.9,
            max_tokens=4096,
            stream=False
        )
        detailed_response = completion.choices[0].message.content.strip()
        # Step 2: Summarize the Historical Context without the model's monologue
        summary_completion = client.chat.completions.create(
            model="deepseek-ai/deepseek-r1",
            messages=[
                {"role": "system", "content": "You are an expert summarizer."},
                {"role": "user", "content": f"{detailed_response}\n\n{prompt_summary}"}
            ],
            temperature=0.4,
            top_p=0.9,
            max_tokens=2048,
            stream=False
        )
        summary_response = summary_completion.choices[0].message.content.strip()
        # deepseek-r1 wraps its chain-of-thought in <think>...</think> tags.
        # Strip exactly that, instead of the old r"^.*?\n\n" (DOTALL) sub,
        # which deleted everything up to the first blank line and so ate the
        # first bullet whenever the reply had no preamble at all.
        clean_summary = re.sub(r"<think>.*?</think>", "", summary_response, flags=re.DOTALL).strip()
        return clean_summary if clean_summary else "No historical context found."
    except Exception as e:
        return f"Error retrieving AI-generated historical context: {str(e)}"
# Host operating systems the user can pick; only used to decide whether
# ImageReader must point pytesseract at an explicit binary path.
OS = enum.Enum("OS", [("Mac", 0), ("Windows", 1)])
# OCR languages offered in the UI, keyed by their tesseract language-pack codes.
Languages = enum.Enum(
    "Languages",
    [("English", "eng"), ("Filipino", "fil"), ("Spanish", "spa")],
)
class ImageReader:
    """Extract text from images (tesseract OCR) or PDFs, and mine key details
    (dates via regex, names/locations via the module-level NER pipeline)."""

    def __init__(self, os):
        # On the hosted Linux deployment the "Windows" choice points at the
        # apt-installed binary; "Mac" relies on tesseract being on PATH.
        # NOTE(review): on a real Windows host this path would be wrong —
        # it mirrors the original deployment hack; confirm before changing.
        if os == OS.Windows:
            pytesseract.tesseract_cmd = '/usr/bin/tesseract'

    def extract_text(self, image: Image, lang: Languages):
        """OCR *image* in the given language and collapse all whitespace runs."""
        raw = pytesseract.image_to_string(image, lang=lang.value)
        return ' '.join(raw.split())

    def extract_text_from_pdf(self, pdf_file, lang: Languages):
        """Concatenate the embedded text of every page (lang is unused:
        PyPDF2 reads the text layer directly, no OCR involved)."""
        reader = PyPDF2.PdfReader(pdf_file)
        pieces = []
        for page in reader.pages:
            # extract_text() may return None for image-only pages
            pieces.append(page.extract_text() or "")
        return "".join(pieces)

    def extract_key_details(self, text):
        """Return {'dates', 'names', 'locations'} sets mined from *text*."""
        date_pattern = r'\b(?:\d{1,2}[/\-]\d{1,2}[/\-]\d{2,4}|\d{4})\b'
        details = {
            "dates": set(re.findall(date_pattern, text)),
            "names": set(),
            "locations": set(),
        }
        for entity in nlp(text):
            label = entity['entity']
            if "PER" in label:
                details['names'].add(entity['word'])
            elif "LOC" in label:
                details['locations'].add(entity['word'])
        return details
# UI Layout
# NOTE(review): original indentation was lost in extraction; the nesting
# below (selects in col1, everything else top-level) is the most plausible
# reconstruction — verify against the deployed app.
st.title("π OCR & Historical Context Analyzer")
st.markdown("Extract text from images and PDFs, analyze named entities, and retrieve historical context.")
col1, col2 = st.columns([1, 2])
with col1:
    # OS choice only affects where ImageReader looks for the tesseract binary.
    selected_os = st.selectbox("π₯οΈ Select your OS", [OS.Windows, OS.Mac], format_func=lambda x: x.name)
    selected_lang = st.selectbox("π Select language", list(Languages), format_func=lambda x: x.name)
uploaded_file = st.file_uploader("π Upload an image or PDF", type=["png", "jpg", "jpeg", "pdf"])
if uploaded_file:
    ir = ImageReader(selected_os)
    extracted_text = ""
    # Browsers report .jpg uploads as image/jpeg, so this covers png/jpg/jpeg;
    # anything else the uploader accepted must be a PDF.
    if uploaded_file.type in ["image/png", "image/jpeg"]:
        image = Image.open(uploaded_file)
        st.image(image, caption="Uploaded Image", use_column_width=True)
        extracted_text = ir.extract_text(image, selected_lang)
    else:
        extracted_text = ir.extract_text_from_pdf(uploaded_file, selected_lang)
    st.markdown("### π Extracted Text:")
    st.markdown(style_text(extracted_text), unsafe_allow_html=True)
    key_details = ir.extract_key_details(extracted_text)
    st.markdown("### π Extracted Key Details")
    st.write(f"**π Dates:** {', '.join(key_details['dates']) if key_details['dates'] else 'None found'}")
    st.write(f"**π€ Names:** {', '.join(key_details['names']) if key_details['names'] else 'None found'}")
    st.write(f"**π Locations:** {', '.join(key_details['locations']) if key_details['locations'] else 'None found'}")
    # Every extracted entity is fed to the LLM as one space-joined query string.
    combined_terms = ' '.join(key_details['dates'].union(key_details['locations']).union(key_details['names']))
    historical_context = generate_historical_context_nvidia(combined_terms)
    st.markdown("### ποΈ Historical Context")
    st.markdown(style_text(historical_context), unsafe_allow_html=True)
    st.markdown("### π Search the Web")
    search_query = st.text_input("Enter a keyword or phrase:")
    if search_query:
        # NOTE(review): any rerun (e.g. typing in this box) re-executes the
        # OCR, NER, LLM and geocoding work above — st.cache_data would help.
        search_results = generate_historical_context_nvidia(search_query)
        st.markdown(style_text(search_results), unsafe_allow_html=True)
    related_docs = find_related_documents(combined_terms)
    st.markdown("### π Related Historical Documents")
    for link in related_docs:
        st.markdown(f"[π {link}]({link})")
    st.markdown("### πΊοΈ Map of Key Locations")
    map_center = [10.0, 10.0]  # arbitrary world-view starting center
    map_obj = folium.Map(location=map_center, zoom_start=2)
    for loc in key_details['locations']:
        coords = geocode_location(loc)
        if coords:  # silently skip locations Nominatim could not resolve
            folium.Marker(coords, popup=loc).add_to(map_obj)
    st_folium(map_obj, width=700, height=500)