Spaces:

mk1985
/

Historical-Text-Analyser

Sleeping

App Files Files Community

mk1985 committed on Jul 19, 2025

Commit

2f5ff37

verified ·

1 Parent(s): a572172

Create app.py

Browse files

Files changed (1) hide show

app.py +275 -0

app.py ADDED Viewed

	@@ -0,0 +1,275 @@

+# 📚 Install dependencies
+# Make sure to run this in your environment if you haven't already
+# !pip install openai anthropic google-generativeai gradio transformers torch gliner --quiet
+# ⚙️ Imports
+import openai
+import anthropic
+import google.generativeai as genai
+import gradio as gr
+from gliner import GLiNER
+import traceback
+from collections import defaultdict, Counter # Import Counter for counting
+import re
+# 🧠 Supported models and their providers
+MODEL_OPTIONS = {
+    "OpenAI (GPT-4o)": "openai",
+    "Anthropic (Claude 3 Opus)": "anthropic",
+    "Google (Gemini 1.5 Pro)": "google"
+}
+# 🔧 GLiNER Model Configuration
+GLINER_MODEL_NAME = "urchade/gliner_large-v2.1"
+# --- Load the model only once at startup ---
+try:
+    print("Loading GLiNER model... This may take a moment.")
+    gliner_model = GLiNER.from_pretrained(GLINER_MODEL_NAME)
+    print("GLiNER model loaded successfully.")
+except Exception as e:
+    print(f"FATAL ERROR: Could not load GLiNER model. The app will not be able to find entities. Error: {e}")
+    gliner_model = None
+# 🧠 Prompt for generating the research framework
+# `{topic}` is the only placeholder; handle_generate fills it with str.format().
+# The layout requested here ("###" category headers, each followed by "-" keyword
+# lines) is exactly what handle_generate's line parser looks for, so the prompt
+# and the parser must be changed together.
+HIERARCHICAL_PROMPT_TEMPLATE = """
+You are a helpful research assistant. For the historical topic: **"{topic}"**, your job is to suggest a research framework.
+**Instructions:**
+1.  First, think of 4-6 **Conceptual Categories** that are useful for analyzing this topic (e.g., 'Forms of Protest', 'Key Demands').
+2.  For each category, list the specific **Keywords** someone could search for in a text.
+3.  **Crucial Rule for Keywords:** Use the most basic, fundamental form (e.g., `Petition`, not `Political Petition`).
+**Output Format:**
+Use Markdown. Each category must be a Level 3 Header (###), followed by a comma-separated list of its keywords.
+### Example Category 1
+- Keyword A, Keyword B, Keyword C
+### Example Category 2
+- Keyword D, Keyword E
+"""
+# 🧠 Generator Function
+def generate_from_prompt(prompt, provider, key_dict):
+    provider_id = MODEL_OPTIONS.get(provider)
+    api_key = key_dict.get(f"{provider_id}_key")
+    if not api_key:
+        raise ValueError(f"API key for {provider} not found.")
+    if provider_id == "openai":
+        client = openai.OpenAI(api_key=api_key)
+        response = client.chat.completions.create(model="gpt-4o", messages=[{"role": "user", "content": prompt}], temperature=0.2)
+        return response.choices[0].message.content.strip()
+    elif provider_id == "anthropic":
+        client = anthropic.Anthropic(api_key=api_key)
+        response = client.messages.create(model="claude-3-opus-20240229", max_tokens=1024, messages=[{"role": "user", "content": prompt}])
+        return response.content[0].text.strip()
+    elif provider_id == "google":
+        genai.configure(api_key=api_key)
+        model = genai.GenerativeModel('gemini-1.5-pro-latest')
+        response = model.generate_content(prompt)
+        return response.text.strip()
+    return ""
+# Standard entity labels offered, and pre-selected by default, in the
+# "Standard Search Terms" checkbox group.
+TRADITIONAL_NER_LABELS = [
+    "Person", "Organisation", "Country / City / State", "Location",
+    "Nationality or Group", "Date", "Event", "Law / Legal Document",
+    "Product", "Facility", "Work of Art", "Language", "Time", "Percentage",
+    "Money / Currency", "Quantity / Measurement", "Ordinal Number", "Cardinal Number"
+]
+# Number of category accordions pre-built in the UI. AI-suggested categories
+# beyond this count are not displayed.
+MAX_CATEGORIES = 8
+# Everything from here to the event wiring is declared inside a single Blocks context named `demo`.
+with gr.Blocks(title="Historical Text Analysis Tool", css=".prose { word-break: break-word; }") as demo:
+    gr.Markdown("# Historical Text Analysis Tool")
+    # --- Step 1: topic, model choice and API keys ---
+    gr.Markdown("## Step 1: Get Keyword Ideas")
+    gr.Markdown("Start by entering a topic. The AI will populate a research framework with suggested categories and keywords to guide your analysis.")
+    with gr.Row():
+        topic = gr.Textbox(label="Enter Historical Topic", placeholder="e.g., The Chartist Movement, The Protestant Reformation")
+        provider = gr.Dropdown(choices=list(MODEL_OPTIONS.keys()), label="Choose AI Model")
+    with gr.Row():
+        # One password box per provider; handle_generate reads all three.
+        openai_key = gr.Textbox(label="OpenAI API Key", type="password", placeholder="Required for OpenAI")
+        anthropic_key = gr.Textbox(label="Anthropic API Key", type="password", placeholder="Required for Anthropic")
+        google_key = gr.Textbox(label="Google API Key", type="password", placeholder="Required for Google")
+    generate_btn = gr.Button("Suggest Categories and Keywords", variant="primary")
+    # --- Step 2: keyword selection and text analysis ---
+    gr.Markdown("--- \n## Step 2: Build Your Search and Analyze Text")
+    gr.Markdown("The AI's suggestions will appear below. Build your final list of keywords, then paste your text to find all the matches.")
+    gr.Markdown("### 1. Review AI-Suggested Keywords")
+    gr.Markdown("Click on a category to see its keywords. Uncheck any you do not want, or use the 'Deselect All' button for that category.")
+    # Pre-build MAX_CATEGORIES hidden accordions. handle_generate later relabels
+    # and reveals as many as the model's answer needs and hides the rest.
+    dynamic_components = []
+    with gr.Column():
+        for i in range(MAX_CATEGORIES):
+            with gr.Accordion(f"Category {i+1}", visible=False) as acc:
+                with gr.Row():
+                    cg = gr.CheckboxGroup(label="Keywords", interactive=True, container=False, scale=4)
+                    deselect_btn = gr.Button("Deselect All", size="sm", scale=1, min_width=80)
+                # Each entry is an (accordion, checkbox group, deselect button) triple.
+                dynamic_components.append((acc, cg, deselect_btn))
+    gr.Markdown("### 2. Include Standard Keywords (Optional)")
+    with gr.Group():
+        # All standard labels start selected (value == choices).
+        ner_output = gr.CheckboxGroup(choices=TRADITIONAL_NER_LABELS, value=TRADITIONAL_NER_LABELS, label="Standard Search Terms", info="Common categories like people, places, and specific organizations.")
+        deselect_ner_btn = gr.Button("Deselect All", size="sm")
+    gr.Markdown("### 3. Add Your Own Keywords (Optional)")
+    with gr.Group():
+        gr.Markdown("**Add any other keywords**")
+        # Free-text, comma-separated labels; match_entities splits this on commas.
+        custom_labels = gr.Textbox(label=None, placeholder="e.g., Technology, Weapon, Secret Society... (separated by commas)", show_label=False)
+    # Forwarded unchanged as the `threshold` argument of gliner_model.predict_entities.
+    threshold_slider = gr.Slider(minimum=0.1, maximum=1.0, value=0.4, step=0.05, label="Confidence Threshold", info="This controls how strict the search is. Lower to find more matches (less strict). Raise for fewer, more precise matches (more strict).")
+    text_input = gr.Textbox(label="Paste Your Full Text Here for Analysis", lines=10, placeholder="Paste a historical document, an article, or a chapter...")
+    match_btn = gr.Button("Find Keywords in Text", variant="primary")
+    # The three tabs receive, in order, the three values returned by match_entities.
+    with gr.Tabs():
+        with gr.TabItem("Highlighted Text"):
+            matched_output = gr.HighlightedText(label="Keyword Matches", interactive=True)
+        with gr.TabItem("Detailed Results"):
+            detailed_results_output = gr.Markdown(label="List of Matches per Keyword")
+        with gr.TabItem("Debug Info"):
+            debug_output = gr.Textbox(label="Extraction Log", interactive=False, lines=8)
+    # --- Backend Functions ---
+    import os # NOTE(review): used by handle_generate for environment-variable API keys; this import sits inside the Blocks context and conventionally belongs with the top-of-file imports
+    def handle_generate(topic, provider, openai_k, anthropic_k, google_k):
+        # This function provides instant "working..." feedback
+        yield {
+            generate_btn: gr.update(value="Generating...", interactive=False)
+        }
+        try:
+            # On Hugging Face, use secure secrets. Locally, use the text boxes.
+            key_dict = {
+                "openai_key": os.environ.get("OPENAI_API_KEY", openai_k),
+                "anthropic_key": os.environ.get("ANTHROPIC_API_KEY", anthropic_k),
+                "google_key": os.environ.get("GOOGLE_API_KEY", google_k)
+            }
+            provider_id = MODEL_OPTIONS.get(provider)
+            if not topic or not provider or not key_dict.get(f"{provider_id}_key"):
+                raise gr.Error("Topic, Provider, and the correct API Key are required.")
+            prompt = HIERARCHICAL_PROMPT_TEMPLATE.format(topic=topic)
+            raw_framework = generate_from_prompt(prompt, provider, key_dict)
+            framework = defaultdict(list)
+            current_category = None
+            for line in raw_framework.split('\n'):
+                line = line.strip()
+                if line.startswith("###"):
+                    current_category = line.replace("###", "").strip()
+                elif line.startswith("-") and current_category:
+                    entities = line.replace("-", "").strip()
+                    framework[current_category].extend([e.strip() for e in entities.split(',') if e.strip()])
+            if not framework:
+                raise gr.Error("AI failed to generate categories. Please try again.")
+            updates = {}
+            categories = list(framework.items())
+            for i in range(MAX_CATEGORIES):
+                accordion_comp, checkbox_comp, button_comp = dynamic_components[i]
+                if i < len(categories):
+                    category, entities = categories[i]
+                    sorted_entities = sorted(list(set(entities)))
+                    updates[accordion_comp] = gr.update(label=category, visible=True)
+                    updates[checkbox_comp] = gr.update(choices=sorted_entities, value=sorted_entities, visible=True)
+                    updates[button_comp] = gr.update(visible=True)
+                else:
+                    updates[accordion_comp] = gr.update(visible=False)
+                    updates[checkbox_comp] = gr.update(visible=False)
+                    updates[button_comp] = gr.update(visible=False)
+            updates[generate_btn] = gr.update(value="Suggest Categories and Keywords", interactive=True)
+            yield updates
+        except Exception as e:
+            yield {generate_btn: gr.update(value="Suggest Categories and Keywords", interactive=True)}
+            raise gr.Error(str(e))
+    # --- THIS IS THE UPDATED FUNCTION ---
+    def match_entities(text, ner_labels, custom_label_text, threshold, *selected_keywords):
+        debug_info = []
+        if gliner_model is None:
+            raise gr.Error("GLiNER model failed to load at startup. Cannot analyze text. Please check the logs and restart the application.")
+        labels_to_use = set()
+        for group in selected_keywords:
+            if group: labels_to_use.update(group)
+        if ner_labels: labels_to_use.update(ner_labels)
+        custom = {l.strip() for l in custom_label_text.split(',') if l.strip()}
+        if custom: labels_to_use.update(custom)
+        final_labels = sorted(list(labels_to_use))
+        debug_info.append(f"🧠 Searching for {len(final_labels)} unique keywords.")
+        debug_info.append(f"⚙️ Confidence Threshold: {threshold}")
+        if not text or not final_labels:
+            return {"text": text, "entities": []}, "Please provide text and select keywords.", "\n".join(debug_info)
+        all_entities = []
+        chunk_size, overlap = 1000, 50
+        for i in range(0, len(text), chunk_size - overlap):
+            chunk = text[i : i + chunk_size]
+            chunk_entities = gliner_model.predict_entities(chunk, final_labels, threshold=threshold)
+            for ent in chunk_entities:
+                ent['start'] += i; ent['end'] += i
+                all_entities.append(ent)
+        unique_entities = [dict(t) for t in {tuple(d.items()) for d in all_entities}]
+        debug_info.append(f"📊 Found {len(unique_entities)} unique matches.")
+        highlighted_entities = [{"start": ent["start"], "end": ent["end"], "entity": ent["label"]} for ent in unique_entities]
+        # --- NEW LOGIC FOR AGGREGATED, TABLE-BASED RESULTS ---
+        # 1. Count occurrences of each unique phrase (case-insensitively)
+        aggregated_matches = defaultdict(Counter)
+        original_casing_map = {} # To store the original casing of the first instance of a phrase
+        for ent in unique_entities:
+            match_text = text[ent['start']:ent['end']]
+            match_text_lower = match_text.lower()
+            aggregated_matches[ent['label']][match_text_lower] += 1
+            original_casing_map.setdefault(match_text_lower, match_text) # Store original casing
+        # 2. Build the new Markdown string with tables
+        markdown_string = ""
+        for label, counter in sorted(aggregated_matches.items()):
+            total_matches = sum(counter.values())
+            unique_phrases = len(counter)
+            markdown_string += f"### {label} (Total: {total_matches} | Unique: {unique_phrases})\n"
+            markdown_string += "| Found Phrase | Occurrences |\n"
+            markdown_string += "|--------------|-------------|\n"
+            # Sort phrases by most frequent first
+            for phrase_lower, count in counter.most_common():
+                original_phrase = original_casing_map[phrase_lower]
+                markdown_string += f"| {original_phrase} | {count} |\n"
+            markdown_string += "\n"
+        if not markdown_string:
+            markdown_string = "No keywords found. Try lowering the confidence threshold or changing keywords."
+        return {"text": text, "entities": highlighted_entities}, markdown_string, "\n".join(debug_info)
+    # --- Wire up UI events ---
+    generate_btn.click(
+        fn=handle_generate,
+        inputs=[topic, provider, openai_key, anthropic_key, google_key],
+        outputs=[generate_btn] + [comp for pair in dynamic_components for comp in pair]
+    )
+    def deselect_all():
+        return gr.update(value=[])
+    deselect_ner_btn.click(fn=deselect_all, inputs=None, outputs=[ner_output])
+    for _, cg, btn in dynamic_components:
+        btn.click(fn=deselect_all, inputs=None, outputs=[cg])
+    match_btn.click(
+        fn=match_entities,
+        inputs=[text_input, ner_output, custom_labels, threshold_slider] + [cg for acc, cg, btn in dynamic_components],
+        outputs=[matched_output, detailed_results_output, debug_output]
+    )
+demo.launch(share=True, debug=True)