# NOTE(review): the "Spaces:" / "Build error" lines that appeared here are
# Hugging Face Spaces build-log artifacts captured with the source,
# not program text.
| import streamlit as st | |
| from PIL import Image | |
| from pytesseract import pytesseract | |
| import PyPDF2 | |
| import enum | |
| import os | |
| # Install Tesseract at runtime | |
| if not os.path.exists("/usr/bin/tesseract"): | |
| os.system("apt-get update && apt-get install -y tesseract-ocr libtesseract-dev") | |
| import re | |
| from collections import defaultdict | |
| import folium | |
| from streamlit_folium import st_folium | |
| from geopy.geocoders import Nominatim | |
| from geopy.exc import GeocoderTimedOut | |
| import wikipedia | |
| from transformers import pipeline | |
| from openai import OpenAI | |
# NVIDIA OpenAI API Setup
# SECURITY: the API key was previously hard-coded here. Never commit
# credentials to source control — read the key from the environment
# (set NVIDIA_API_KEY in the deployment's secrets) instead.
client = OpenAI(
    base_url="https://integrate.api.nvidia.com/v1",
    api_key=os.environ.get("NVIDIA_API_KEY", ""),
)
# Load Named Entity Recognition (NER) Model
# Module-level singleton: downloading/loading the BERT CoNLL-03 NER model is
# expensive, so it is done once at import time rather than per request.
nlp = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english")
# NOTE(review): set_page_config must run before any other st.* call in the
# script — confirm no st.* call precedes it once the file is reassembled.
st.set_page_config(page_title="OCR & Historical Analysis", page_icon="π", layout="wide")
| # Custom Styling | |
def style_text(text):
    """Render *text* inside a grey rounded box for st.markdown display.

    The result is injected with unsafe_allow_html=True, so the payload is
    HTML-escaped first: OCR output and model responses are untrusted and
    may contain markup that would otherwise break (or script-inject) the
    rendered page.
    """
    import html  # local import keeps this fix self-contained

    return f"""
    <div style='padding:10px;border-radius:10px;
                background-color:#e0e0e0;
                color:#333;
                font-weight:500;
                font-size:16px;'>
        {html.escape(text)}
    </div>
    """
def find_related_documents(query):
    """Return up to five Wikipedia page URLs related to *query*.

    Best-effort: any failure (network, disambiguation, missing page) is
    reported as a single-element error list instead of being raised, so
    the Streamlit page keeps rendering.
    """
    try:
        urls = []
        for title in wikipedia.search(query, results=5):
            urls.append(wikipedia.page(title).url)
        return urls
    except Exception as e:
        return [f"Error retrieving related documents: {str(e)}"]
def geocode_location(location):
    """Resolve a place name to a (lat, lon) tuple via Nominatim.

    Returns None when the lookup times out or the name cannot be resolved.
    """
    geolocator = Nominatim(user_agent="streamlit_app")
    try:
        match = geolocator.geocode(location, timeout=10)
    except GeocoderTimedOut:
        return None
    if match is None:
        return None
    return (match.latitude, match.longitude)
def generate_historical_context_nvidia(text):
    """Use NVIDIA OpenAI API to generate a structured, summarized historical context.

    Two-pass pipeline: (1) ask the model for a detailed historical analysis
    of *text*; (2) ask it to compress that analysis into at most five bullet
    points. Returns the cleaned summary string, or an error string on any
    API failure (best-effort: the Streamlit page should render regardless).
    """
    prompt_analysis = f"""
    Analyze the following text and provide a historical context. Identify:
    - Key historical events
    - Significant figures involved
    - The broader historical significance
    Text: {text}
    Provide a detailed response.
    """
    prompt_summary = """
    Summarize the historical context provided above in a concise and structured format:
    - Limit to 5 bullet points
    - Each bullet point should be under 100 words
    - Avoid unnecessary explanations or preamble — return only the summary
    """
    try:
        # Step 1: Generate Detailed Historical Context
        completion = client.chat.completions.create(
            model="deepseek-ai/deepseek-r1",
            messages=[
                {"role": "system", "content": "You are a historian providing detailed historical insights."},
                {"role": "user", "content": prompt_analysis}
            ],
            temperature=0.4,
            top_p=0.9,
            max_tokens=4096,
            stream=False
        )
        detailed_response = completion.choices[0].message.content.strip()
        # Step 2: Summarize the Historical Context without the model's monologue
        summary_completion = client.chat.completions.create(
            model="deepseek-ai/deepseek-r1",
            messages=[
                {"role": "system", "content": "You are an expert summarizer."},
                {"role": "user", "content": f"{detailed_response}\n\n{prompt_summary}"}
            ],
            temperature=0.4,
            top_p=0.9,
            max_tokens=2048,
            stream=False
        )
        summary_response = summary_completion.choices[0].message.content.strip()
        # deepseek-r1 wraps its chain-of-thought in <think>...</think> tags.
        # Strip exactly that, instead of the old r"^.*?\n\n" (DOTALL) sub,
        # which deleted everything up to the first blank line and so ate the
        # first bullet whenever the reply had no preamble at all.
        clean_summary = re.sub(r"<think>.*?</think>", "", summary_response, flags=re.DOTALL).strip()
        return clean_summary if clean_summary else "No historical context found."
    except Exception as e:
        return f"Error retrieving AI-generated historical context: {str(e)}"
# Host operating systems the user can pick; only used to decide whether
# ImageReader must point pytesseract at an explicit binary path.
OS = enum.Enum("OS", [("Mac", 0), ("Windows", 1)])
# OCR languages offered in the UI, keyed by their tesseract language-pack codes.
Languages = enum.Enum(
    "Languages",
    [("English", "eng"), ("Filipino", "fil"), ("Spanish", "spa")],
)
class ImageReader:
    """Extract text from images (tesseract OCR) or PDFs, and mine key details
    (dates via regex, names/locations via the module-level NER pipeline)."""

    def __init__(self, os):
        # On the hosted Linux deployment the "Windows" choice points at the
        # apt-installed binary; "Mac" relies on tesseract being on PATH.
        # NOTE(review): on a real Windows host this path would be wrong —
        # it mirrors the original deployment hack; confirm before changing.
        if os == OS.Windows:
            pytesseract.tesseract_cmd = '/usr/bin/tesseract'

    def extract_text(self, image: Image, lang: Languages):
        """OCR *image* in the given language and collapse all whitespace runs."""
        raw = pytesseract.image_to_string(image, lang=lang.value)
        return ' '.join(raw.split())

    def extract_text_from_pdf(self, pdf_file, lang: Languages):
        """Concatenate the embedded text of every page (lang is unused:
        PyPDF2 reads the text layer directly, no OCR involved)."""
        reader = PyPDF2.PdfReader(pdf_file)
        pieces = []
        for page in reader.pages:
            # extract_text() may return None for image-only pages
            pieces.append(page.extract_text() or "")
        return "".join(pieces)

    def extract_key_details(self, text):
        """Return {'dates', 'names', 'locations'} sets mined from *text*."""
        date_pattern = r'\b(?:\d{1,2}[/\-]\d{1,2}[/\-]\d{2,4}|\d{4})\b'
        details = {
            "dates": set(re.findall(date_pattern, text)),
            "names": set(),
            "locations": set(),
        }
        for entity in nlp(text):
            label = entity['entity']
            if "PER" in label:
                details['names'].add(entity['word'])
            elif "LOC" in label:
                details['locations'].add(entity['word'])
        return details
# UI Layout
# NOTE(review): original indentation was lost in extraction; the nesting
# below (selects in col1, everything else top-level) is the most plausible
# reconstruction — verify against the deployed app.
st.title("π OCR & Historical Context Analyzer")
st.markdown("Extract text from images and PDFs, analyze named entities, and retrieve historical context.")
col1, col2 = st.columns([1, 2])
with col1:
    # OS choice only affects where ImageReader looks for the tesseract binary.
    selected_os = st.selectbox("π₯οΈ Select your OS", [OS.Windows, OS.Mac], format_func=lambda x: x.name)
    selected_lang = st.selectbox("π Select language", list(Languages), format_func=lambda x: x.name)
uploaded_file = st.file_uploader("π Upload an image or PDF", type=["png", "jpg", "jpeg", "pdf"])
if uploaded_file:
    ir = ImageReader(selected_os)
    extracted_text = ""
    # Browsers report .jpg uploads as image/jpeg, so this covers png/jpg/jpeg;
    # anything else the uploader accepted must be a PDF.
    if uploaded_file.type in ["image/png", "image/jpeg"]:
        image = Image.open(uploaded_file)
        st.image(image, caption="Uploaded Image", use_column_width=True)
        extracted_text = ir.extract_text(image, selected_lang)
    else:
        extracted_text = ir.extract_text_from_pdf(uploaded_file, selected_lang)
    st.markdown("### π Extracted Text:")
    st.markdown(style_text(extracted_text), unsafe_allow_html=True)
    key_details = ir.extract_key_details(extracted_text)
    st.markdown("### π Extracted Key Details")
    st.write(f"**π Dates:** {', '.join(key_details['dates']) if key_details['dates'] else 'None found'}")
    st.write(f"**π€ Names:** {', '.join(key_details['names']) if key_details['names'] else 'None found'}")
    st.write(f"**π Locations:** {', '.join(key_details['locations']) if key_details['locations'] else 'None found'}")
    # Every extracted entity is fed to the LLM as one space-joined query string.
    combined_terms = ' '.join(key_details['dates'].union(key_details['locations']).union(key_details['names']))
    historical_context = generate_historical_context_nvidia(combined_terms)
    st.markdown("### ποΈ Historical Context")
    st.markdown(style_text(historical_context), unsafe_allow_html=True)
    st.markdown("### π Search the Web")
    search_query = st.text_input("Enter a keyword or phrase:")
    if search_query:
        # NOTE(review): any rerun (e.g. typing in this box) re-executes the
        # OCR, NER, LLM and geocoding work above — st.cache_data would help.
        search_results = generate_historical_context_nvidia(search_query)
        st.markdown(style_text(search_results), unsafe_allow_html=True)
    related_docs = find_related_documents(combined_terms)
    st.markdown("### π Related Historical Documents")
    for link in related_docs:
        st.markdown(f"[π {link}]({link})")
    st.markdown("### πΊοΈ Map of Key Locations")
    map_center = [10.0, 10.0]  # arbitrary world-view starting center
    map_obj = folium.Map(location=map_center, zoom_start=2)
    for loc in key_details['locations']:
        coords = geocode_location(loc)
        if coords:  # silently skip locations Nominatim could not resolve
            folium.Marker(coords, popup=loc).add_to(map_obj)
    st_folium(map_obj, width=700, height=500)