mk1985 committed on
Commit
2f5ff37
·
verified ·
1 Parent(s): a572172

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +275 -0
app.py ADDED
@@ -0,0 +1,275 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# 📚 Install dependencies
# Make sure to run this in your environment if you haven't already
# !pip install openai anthropic google-generativeai gradio transformers torch gliner --quiet

# ⚙️ Imports
# Standard library
import os
import re
import traceback
from collections import defaultdict, Counter  # Counter is used for per-phrase counting

# Third-party
import anthropic
import google.generativeai as genai
import gradio as gr
import openai
from gliner import GLiNER
14
+
15
# 🧠 Supported models and their providers.
# Maps the human-readable dropdown label to the internal provider id used
# by generate_from_prompt() to pick an SDK and an API key.
MODEL_OPTIONS = dict([
    ("OpenAI (GPT-4o)", "openai"),
    ("Anthropic (Claude 3 Opus)", "anthropic"),
    ("Google (Gemini 1.5 Pro)", "google"),
])
21
+
22
# 🔧 GLiNER Model Configuration
GLINER_MODEL_NAME = "urchade/gliner_large-v2.1"

# --- Load the model only once at startup ---
# A failed load is deliberately non-fatal to the UI: gliner_model stays
# None and match_entities() reports the problem to the user at call time.
gliner_model = None
try:
    print("Loading GLiNER model... This may take a moment.")
    gliner_model = GLiNER.from_pretrained(GLINER_MODEL_NAME)
    print("GLiNER model loaded successfully.")
except Exception as e:
    print(f"FATAL ERROR: Could not load GLiNER model. The app will not be able to find entities. Error: {e}")
33
+
34
# 🧠 Prompt for generating the research framework
# handle_generate() fills {topic} via str.format() and then parses the
# reply's "### Category" headers and "- keyword, keyword" bullet lines.
HIERARCHICAL_PROMPT_TEMPLATE = """
You are a helpful research assistant. For the historical topic: **"{topic}"**, your job is to suggest a research framework.

**Instructions:**
1. First, think of 4-6 **Conceptual Categories** that are useful for analyzing this topic (e.g., 'Forms of Protest', 'Key Demands').
2. For each category, list the specific **Keywords** someone could search for in a text.
3. **Crucial Rule for Keywords:** Use the most basic, fundamental form (e.g., `Petition`, not `Political Petition`).

**Output Format:**
Use Markdown. Each category must be a Level 3 Header (###), followed by a comma-separated list of its keywords.

### Example Category 1
- Keyword A, Keyword B, Keyword C
### Example Category 2
- Keyword D, Keyword E
"""
51
+
52
# 🧠 Generator Function
def generate_from_prompt(prompt, provider, key_dict):
    """Send *prompt* to the selected LLM provider and return its text reply.

    Args:
        prompt: The fully formatted prompt string.
        provider: Human-readable provider label (a key of MODEL_OPTIONS).
        key_dict: Mapping of "<provider_id>_key" -> API key string.

    Returns:
        The model's response text, stripped of surrounding whitespace.

    Raises:
        ValueError: If the provider is unknown or its API key is missing.
    """
    provider_id = MODEL_OPTIONS.get(provider)
    if provider_id is None:
        # Fail loudly: the old code fell through and returned "" for an
        # unknown provider, and looked up the misleading "None_key".
        raise ValueError(f"Unsupported provider: {provider}")

    api_key = key_dict.get(f"{provider_id}_key")
    if not api_key:
        raise ValueError(f"API key for {provider} not found.")

    if provider_id == "openai":
        client = openai.OpenAI(api_key=api_key)
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.2,
        )
        return response.choices[0].message.content.strip()
    elif provider_id == "anthropic":
        client = anthropic.Anthropic(api_key=api_key)
        response = client.messages.create(
            model="claude-3-opus-20240229",
            max_tokens=1024,
            messages=[{"role": "user", "content": prompt}],
        )
        return response.content[0].text.strip()
    elif provider_id == "google":
        genai.configure(api_key=api_key)
        model = genai.GenerativeModel('gemini-1.5-pro-latest')
        response = model.generate_content(prompt)
        return response.text.strip()
    # Defensive: only reachable if MODEL_OPTIONS gains an id without a branch.
    raise ValueError(f"No handler implemented for provider id: {provider_id}")
73
+
74
# Generic, domain-independent entity labels offered alongside the
# AI-suggested keywords, grouped roughly by theme. Order matters: it is the
# display order of the "Standard Search Terms" CheckboxGroup.
TRADITIONAL_NER_LABELS = (
    ["Person", "Organisation", "Country / City / State", "Location"]
    + ["Nationality or Group", "Date", "Event", "Law / Legal Document"]
    + ["Product", "Facility", "Work of Art", "Language", "Time", "Percentage"]
    + ["Money / Currency", "Quantity / Measurement", "Ordinal Number", "Cardinal Number"]
)

# Upper bound on AI-suggested categories the UI can display.
MAX_CATEGORIES = 8
82
+
83
# Top-level UI definition. Components created here are captured by the
# backend callbacks (handle_generate / match_entities) defined further down
# inside this same `with` block.
with gr.Blocks(title="Historical Text Analysis Tool", css=".prose { word-break: break-word; }") as demo:
    gr.Markdown("# Historical Text Analysis Tool")

    # --- Step 1: topic + provider + API keys ---
    gr.Markdown("## Step 1: Get Keyword Ideas")
    gr.Markdown("Start by entering a topic. The AI will populate a research framework with suggested categories and keywords to guide your analysis.")
    with gr.Row():
        topic = gr.Textbox(label="Enter Historical Topic", placeholder="e.g., The Chartist Movement, The Protestant Reformation")
        provider = gr.Dropdown(choices=list(MODEL_OPTIONS.keys()), label="Choose AI Model")
    with gr.Row():
        # Only the key matching the chosen provider is required; env-var
        # secrets take precedence over these boxes (see handle_generate).
        openai_key = gr.Textbox(label="OpenAI API Key", type="password", placeholder="Required for OpenAI")
        anthropic_key = gr.Textbox(label="Anthropic API Key", type="password", placeholder="Required for Anthropic")
        google_key = gr.Textbox(label="Google API Key", type="password", placeholder="Required for Google")

    generate_btn = gr.Button("Suggest Categories and Keywords", variant="primary")

    # --- Step 2: review suggestions and analyze text ---
    gr.Markdown("--- \n## Step 2: Build Your Search and Analyze Text")
    gr.Markdown("The AI's suggestions will appear below. Build your final list of keywords, then paste your text to find all the matches.")

    gr.Markdown("### 1. Review AI-Suggested Keywords")
    gr.Markdown("Click on a category to see its keywords. Uncheck any you do not want, or use the 'Deselect All' button for that category.")

    # Pre-create a fixed pool of hidden accordion slots; handle_generate
    # shows/relabels as many as the AI returns (up to MAX_CATEGORIES).
    dynamic_components = []
    with gr.Column():
        for i in range(MAX_CATEGORIES):
            with gr.Accordion(f"Category {i+1}", visible=False) as acc:
                with gr.Row():
                    cg = gr.CheckboxGroup(label="Keywords", interactive=True, container=False, scale=4)
                    deselect_btn = gr.Button("Deselect All", size="sm", scale=1, min_width=80)
            dynamic_components.append((acc, cg, deselect_btn))

    gr.Markdown("### 2. Include Standard Keywords (Optional)")
    with gr.Group():
        # All standard labels are pre-checked by default.
        ner_output = gr.CheckboxGroup(choices=TRADITIONAL_NER_LABELS, value=TRADITIONAL_NER_LABELS, label="Standard Search Terms", info="Common categories like people, places, and specific organizations.")
        deselect_ner_btn = gr.Button("Deselect All", size="sm")

    gr.Markdown("### 3. Add Your Own Keywords (Optional)")
    with gr.Group():
        gr.Markdown("**Add any other keywords**")
        # Free-text, comma-separated extra labels merged in by match_entities.
        custom_labels = gr.Textbox(label=None, placeholder="e.g., Technology, Weapon, Secret Society... (separated by commas)", show_label=False)

    threshold_slider = gr.Slider(minimum=0.1, maximum=1.0, value=0.4, step=0.05, label="Confidence Threshold", info="This controls how strict the search is. Lower to find more matches (less strict). Raise for fewer, more precise matches (more strict).")
    text_input = gr.Textbox(label="Paste Your Full Text Here for Analysis", lines=10, placeholder="Paste a historical document, an article, or a chapter...")
    match_btn = gr.Button("Find Keywords in Text", variant="primary")

    # --- Result views: highlighted text, per-keyword tables, debug log ---
    with gr.Tabs():
        with gr.TabItem("Highlighted Text"):
            matched_output = gr.HighlightedText(label="Keyword Matches", interactive=True)
        with gr.TabItem("Detailed Results"):
            detailed_results_output = gr.Markdown(label="List of Matches per Keyword")
        with gr.TabItem("Debug Info"):
            debug_output = gr.Textbox(label="Extraction Log", interactive=False, lines=8)
134
+
135
+ # --- Backend Functions ---
136
+ import os # Make sure this import is at the top of your file
137
+
138
+ def handle_generate(topic, provider, openai_k, anthropic_k, google_k):
139
+ # This function provides instant "working..." feedback
140
+ yield {
141
+ generate_btn: gr.update(value="Generating...", interactive=False)
142
+ }
143
+
144
+ try:
145
+ # On Hugging Face, use secure secrets. Locally, use the text boxes.
146
+ key_dict = {
147
+ "openai_key": os.environ.get("OPENAI_API_KEY", openai_k),
148
+ "anthropic_key": os.environ.get("ANTHROPIC_API_KEY", anthropic_k),
149
+ "google_key": os.environ.get("GOOGLE_API_KEY", google_k)
150
+ }
151
+
152
+ provider_id = MODEL_OPTIONS.get(provider)
153
+ if not topic or not provider or not key_dict.get(f"{provider_id}_key"):
154
+ raise gr.Error("Topic, Provider, and the correct API Key are required.")
155
+
156
+ prompt = HIERARCHICAL_PROMPT_TEMPLATE.format(topic=topic)
157
+ raw_framework = generate_from_prompt(prompt, provider, key_dict)
158
+ framework = defaultdict(list)
159
+ current_category = None
160
+ for line in raw_framework.split('\n'):
161
+ line = line.strip()
162
+ if line.startswith("###"):
163
+ current_category = line.replace("###", "").strip()
164
+ elif line.startswith("-") and current_category:
165
+ entities = line.replace("-", "").strip()
166
+ framework[current_category].extend([e.strip() for e in entities.split(',') if e.strip()])
167
+ if not framework:
168
+ raise gr.Error("AI failed to generate categories. Please try again.")
169
+
170
+ updates = {}
171
+ categories = list(framework.items())
172
+ for i in range(MAX_CATEGORIES):
173
+ accordion_comp, checkbox_comp, button_comp = dynamic_components[i]
174
+ if i < len(categories):
175
+ category, entities = categories[i]
176
+ sorted_entities = sorted(list(set(entities)))
177
+ updates[accordion_comp] = gr.update(label=category, visible=True)
178
+ updates[checkbox_comp] = gr.update(choices=sorted_entities, value=sorted_entities, visible=True)
179
+ updates[button_comp] = gr.update(visible=True)
180
+ else:
181
+ updates[accordion_comp] = gr.update(visible=False)
182
+ updates[checkbox_comp] = gr.update(visible=False)
183
+ updates[button_comp] = gr.update(visible=False)
184
+ updates[generate_btn] = gr.update(value="Suggest Categories and Keywords", interactive=True)
185
+ yield updates
186
+ except Exception as e:
187
+ yield {generate_btn: gr.update(value="Suggest Categories and Keywords", interactive=True)}
188
+ raise gr.Error(str(e))
189
+
190
+ # --- THIS IS THE UPDATED FUNCTION ---
191
+ def match_entities(text, ner_labels, custom_label_text, threshold, *selected_keywords):
192
+ debug_info = []
193
+ if gliner_model is None:
194
+ raise gr.Error("GLiNER model failed to load at startup. Cannot analyze text. Please check the logs and restart the application.")
195
+
196
+ labels_to_use = set()
197
+ for group in selected_keywords:
198
+ if group: labels_to_use.update(group)
199
+ if ner_labels: labels_to_use.update(ner_labels)
200
+ custom = {l.strip() for l in custom_label_text.split(',') if l.strip()}
201
+ if custom: labels_to_use.update(custom)
202
+
203
+ final_labels = sorted(list(labels_to_use))
204
+ debug_info.append(f"🧠 Searching for {len(final_labels)} unique keywords.")
205
+ debug_info.append(f"⚙️ Confidence Threshold: {threshold}")
206
+
207
+ if not text or not final_labels:
208
+ return {"text": text, "entities": []}, "Please provide text and select keywords.", "\n".join(debug_info)
209
+
210
+ all_entities = []
211
+ chunk_size, overlap = 1000, 50
212
+ for i in range(0, len(text), chunk_size - overlap):
213
+ chunk = text[i : i + chunk_size]
214
+ chunk_entities = gliner_model.predict_entities(chunk, final_labels, threshold=threshold)
215
+ for ent in chunk_entities:
216
+ ent['start'] += i; ent['end'] += i
217
+ all_entities.append(ent)
218
+
219
+ unique_entities = [dict(t) for t in {tuple(d.items()) for d in all_entities}]
220
+ debug_info.append(f"📊 Found {len(unique_entities)} unique matches.")
221
+
222
+ highlighted_entities = [{"start": ent["start"], "end": ent["end"], "entity": ent["label"]} for ent in unique_entities]
223
+
224
+ # --- NEW LOGIC FOR AGGREGATED, TABLE-BASED RESULTS ---
225
+ # 1. Count occurrences of each unique phrase (case-insensitively)
226
+ aggregated_matches = defaultdict(Counter)
227
+ original_casing_map = {} # To store the original casing of the first instance of a phrase
228
+
229
+ for ent in unique_entities:
230
+ match_text = text[ent['start']:ent['end']]
231
+ match_text_lower = match_text.lower()
232
+
233
+ aggregated_matches[ent['label']][match_text_lower] += 1
234
+ original_casing_map.setdefault(match_text_lower, match_text) # Store original casing
235
+
236
+ # 2. Build the new Markdown string with tables
237
+ markdown_string = ""
238
+ for label, counter in sorted(aggregated_matches.items()):
239
+ total_matches = sum(counter.values())
240
+ unique_phrases = len(counter)
241
+ markdown_string += f"### {label} (Total: {total_matches} | Unique: {unique_phrases})\n"
242
+ markdown_string += "| Found Phrase | Occurrences |\n"
243
+ markdown_string += "|--------------|-------------|\n"
244
+
245
+ # Sort phrases by most frequent first
246
+ for phrase_lower, count in counter.most_common():
247
+ original_phrase = original_casing_map[phrase_lower]
248
+ markdown_string += f"| {original_phrase} | {count} |\n"
249
+ markdown_string += "\n"
250
+
251
+ if not markdown_string:
252
+ markdown_string = "No keywords found. Try lowering the confidence threshold or changing keywords."
253
+
254
+ return {"text": text, "entities": highlighted_entities}, markdown_string, "\n".join(debug_info)
255
+
256
+ # --- Wire up UI events ---
257
+ generate_btn.click(
258
+ fn=handle_generate,
259
+ inputs=[topic, provider, openai_key, anthropic_key, google_key],
260
+ outputs=[generate_btn] + [comp for pair in dynamic_components for comp in pair]
261
+ )
262
+
263
+ def deselect_all():
264
+ return gr.update(value=[])
265
+ deselect_ner_btn.click(fn=deselect_all, inputs=None, outputs=[ner_output])
266
+ for _, cg, btn in dynamic_components:
267
+ btn.click(fn=deselect_all, inputs=None, outputs=[cg])
268
+
269
+ match_btn.click(
270
+ fn=match_entities,
271
+ inputs=[text_input, ner_output, custom_labels, threshold_slider] + [cg for acc, cg, btn in dynamic_components],
272
+ outputs=[matched_output, detailed_results_output, debug_output]
273
+ )
274
+
275
+ demo.launch(share=True, debug=True)