AsyncBuilds committed on
Commit
3d0b285
·
verified ·
1 Parent(s): 817a320

Added main file

Browse files
Files changed (1) hide show
  1. src/app.py +213 -0
src/app.py ADDED
@@ -0,0 +1,213 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ from io import BytesIO
3
+
4
+ import pandas as pd
5
+ import streamlit as st
6
+ from gliner2 import GLiNER2
7
+
8
+ # ---------------------------------------------------------------------------
9
+ # Constants
10
+ # ---------------------------------------------------------------------------
11
+
# Predefined entity labels, grouped by theme. The combined list is what the
# "Select Fields to Extract" multiselect offers as options.
PERSONAL_FIELDS = [
    "Person Name",
    "Email Address",
    "Phone Number",
    "Street Address",
    "City",
    "Country",
    "Date of Birth",
]
PROFESSIONAL_FIELDS = [
    "Company Name",
    "Department",
    "Job Title",
    "Office Location",
    "Employee ID",
    "Skills",
    "University",
]
BUSINESS_FIELDS = [
    "Counterparty",
    "Contract Value",
    "Effective Date",
    "Jurisdiction",
    "Governing Law",
    "Invoice Number",
    "Product Name",
    "Project Name",
]
# Personal first, then professional, then business, in declaration order.
ALL_PREDEFINED_FIELDS = [*PERSONAL_FIELDS, *PROFESSIONAL_FIELDS, *BUSINESS_FIELDS]

# Checkpoint identifier handed to GLiNER2.from_pretrained().
MODEL_ID = "fastino/gliner2-base-v1"
# Value passed as `threshold=` to model.extract_entities().
EXTRACTION_THRESHOLD = 0.4
28
+
29
+ # ---------------------------------------------------------------------------
30
+ # Page config & styles
31
+ # ---------------------------------------------------------------------------
32
+
# Set the browser-tab title and icon and select the "centered" page layout.
33
+ st.set_page_config(
34
+ page_title="AI Excel Entity Extractor",
35
+ page_icon="🔍",
36
+ layout="centered",
37
+ )
38
+
# Inject a <style> block: app background colour, full-width blue buttons with
# a darker hover colour, and the `.footer` class used by the footer markup.
39
+ st.html("""
40
+ <style>
41
+ .stApp { background-color: #fcfcfc; }
42
+ div.stButton > button:first-child {
43
+ width: 100%;
44
+ border-radius: 8px;
45
+ height: 3.5em;
46
+ background-color: #2563eb;
47
+ color: white;
48
+ font-weight: bold;
49
+ border: none;
50
+ }
51
+ div.stButton > button:hover { background-color: #1d4ed8; border: none; }
52
+ .footer { text-align: center; color: #64748b; font-size: 0.85rem; margin-top: 50px; }
53
+ </style>
54
+ """)
55
+
56
+ # ---------------------------------------------------------------------------
57
+ # Cached resources & helpers
58
+ # ---------------------------------------------------------------------------
59
+
@st.cache_resource(show_spinner="Loading AI model…")
def load_model() -> GLiNER2:
    """Return the GLiNER2 model loaded from the ``MODEL_ID`` checkpoint.

    The function is registered with ``st.cache_resource`` and shows the
    "Loading AI model…" spinner text while it runs.
    """
    model = GLiNER2.from_pretrained(MODEL_ID)
    return model
63
+
64
+
@st.cache_data(show_spinner=False)
def load_excel(file) -> pd.DataFrame:
    """Read an Excel workbook into a DataFrame with pandas' default options.

    Args:
        file: Anything ``pd.read_excel`` accepts; the app passes the object
            returned by ``st.file_uploader``.

    Returns:
        The DataFrame produced by ``pd.read_excel(file)``.
    """
    frame = pd.read_excel(file)
    return frame
68
+
69
+
def to_excel_bytes(df: pd.DataFrame) -> bytes:
    """Serialize *df* to an in-memory .xlsx workbook and return its bytes.

    The sheet is written with the openpyxl engine and without the index
    column.
    """
    stream = BytesIO()
    writer = pd.ExcelWriter(stream, engine="openpyxl")
    # Leaving the `with` block closes the writer, which finishes writing the
    # workbook into the buffer before it is read back below.
    with writer:
        df.to_excel(writer, index=False)
    payload = stream.getvalue()
    return payload
75
+
76
+
def parse_custom_labels(raw: str) -> list[str]:
    """Split user-entered custom entity labels into a clean list.

    Labels are separated by commas. Because the value comes from a multi-line
    text area, line breaks are accepted as separators too, so a label typed on
    its own line does not get fused with its neighbour.

    Args:
        raw: The raw text entered by the user; may be empty.

    Returns:
        The labels in input order, each stripped of surrounding whitespace,
        with empty entries dropped. Duplicates are kept.
    """
    pieces = (
        piece.strip()
        for line in raw.splitlines()
        for piece in line.split(",")
    )
    return [piece for piece in pieces if piece]
79
+
80
+
def is_valid_text(value: str) -> bool:
    """Return True when *value* holds text worth sending to the model.

    A value is rejected when it is empty or whitespace-only, or when it is the
    string "nan" in any letter case — which is what ``str()`` produces for a
    missing (NaN) cell. Both checks use the stripped text, so padding around
    "nan" does not make it count as content.

    Args:
        value: The cell content already converted to ``str``.
    """
    stripped = value.strip()
    return bool(stripped) and stripped.lower() != "nan"
83
+
84
+ # ---------------------------------------------------------------------------
85
+ # UI - Header
86
+ # ---------------------------------------------------------------------------
87
+
st.title("🔍 AI Excel Entity Extractor")
st.markdown(
    "Automatically extract specific entities like Name, Email, etc., "
    "from your spreadsheet text using GLiNER2 Zero-Shot AI."
)

# ---------------------------------------------------------------------------
# Step 1: Upload
# ---------------------------------------------------------------------------

st.write("### 1. Source Data")
uploaded_file = st.file_uploader("Upload an Excel file (.xlsx)", type="xlsx")

if not uploaded_file:
    # Nothing uploaded yet: show a three-card explainer and end this script
    # run so the later steps are not rendered.
    st.write("### How it works")
    how_it_works = (
        (
            "**1. Upload**",
            "Drop an Excel file with a column of text (e.g., emails, descriptions, or notes).",
        ),
        (
            "**2. Define**",
            "Select from common entities like Names and Dates, or type your own custom fields.",
        ),
        (
            "**3. Extract**",
            "The AI reads every row and creates new columns for every entity it discovers.",
        ),
    )
    for column, (heading, blurb) in zip(st.columns(3), how_it_works):
        with column:
            # Two trailing spaces before the newline form a Markdown hard line
            # break; a bare "\n" is only a soft break, which would leave the
            # heading and its description running together on one line.
            st.markdown(f"{heading}  \n{blurb}")
    st.stop()
111
+
112
+ # ---------------------------------------------------------------------------
113
+ # Step 2: Configure
114
+ # ---------------------------------------------------------------------------
115
+
# Show the spinner around the file read (load_excel itself is declared with
# show_spinner=False), and report an unreadable upload in the UI instead of
# letting the exception surface as a raw traceback.
try:
    with st.spinner("Loading configuration…"):
        df = load_excel(uploaded_file)
except Exception as exc:  # UI boundary: any parse failure gets the same message
    st.error(f"Could not read the uploaded file as an Excel workbook: {exc}")
    st.stop()

if df.empty:
    st.error("The uploaded file appears to be empty. Please upload a file with data.")
    st.stop()

row_count = len(df)

st.divider()
st.write("### 2. Configure Extraction")

with st.container(border=True):
    # Top row: which column to analyze, next to the row count.
    col_select, col_info = st.columns([2, 1])
    with col_select:
        text_column = st.selectbox("Select text column to analyze:", df.columns)
    with col_info:
        st.metric("Total Rows", f"{row_count:,}")

    st.write("---")

    # Bottom row: predefined labels on the left, free-form labels on the right.
    col1, col2 = st.columns(2)
    with col1:
        selected_labels = st.multiselect(
            "Select Fields to Extract:",
            options=ALL_PREDEFINED_FIELDS,
            default=["Person Name", "Company Name"],
            help="Choose common entities from the library.",
        )
    with col2:
        custom_labels_str = st.text_area(
            "Custom Entities (Comma Separated):",
            placeholder="e.g. Case Number, Part ID, Deadline",
            help="Define unique entities specific to your data.",
        )

# dict.fromkeys drops duplicate labels while keeping first-seen order, so a
# custom label that repeats a predefined one is only requested once.
active_labels = list(dict.fromkeys(selected_labels + parse_custom_labels(custom_labels_str)))
153
+
154
+ # ---------------------------------------------------------------------------
155
+ # Step 3: Extract
156
+ # ---------------------------------------------------------------------------
157
+
# Everything below runs only in the script run triggered by the button click.
# NOTE(review): clicking the download button presumably triggers another run
# in which st.button() returns False, so the preview would disappear after a
# download — confirm, and consider keeping results in st.session_state.
if not st.button("🚀 Extract Fields"):
    st.stop()

if not active_labels:
    st.warning("⚠️ Please select or define at least one entity to extract.")
    st.stop()

model = load_model()
processed_df = df.copy()

# Give every label its own output column. If the upload already has a column
# with that name (possibly the selected text column itself), write to a
# suffixed column instead of blanking the user's data.
output_columns = {}
for label in active_labels:
    column = label
    while column in processed_df.columns:
        column = f"{column} (extracted)"
    processed_df[column] = ""
    output_columns[label] = column

status = st.empty()
progress_bar = st.progress(0)
start_time = time.time()

# `position` counts rows from 1 independently of the DataFrame's index labels,
# so the progress fraction stays within 0–1 and the messages show row numbers.
for position, (index, value) in enumerate(df[text_column].items(), start=1):
    text = str(value)
    if is_valid_text(text):
        try:
            results = model.extract_entities(text, active_labels, threshold=EXTRACTION_THRESHOLD)
            for label, found_list in results.get("entities", {}).items():
                processed_df.at[index, output_columns.get(label, label)] = ", ".join(found_list)
        except Exception as e:
            # Best effort by design: one failing row is reported and skipped
            # rather than aborting the whole run.
            st.warning(f"Row {position} skipped due to an error: {e}")

    progress_bar.progress(position / row_count)
    status.text(f"Extracting fields from row {position} of {row_count}…")

duration = round(time.time() - start_time, 1)
# Remove the transient progress widgets before showing the results.
progress_bar.empty()
status.empty()

st.success(f"✅ Extraction complete - {row_count:,} rows processed in {duration}s.")

st.write("### 3. Extraction Preview")
st.dataframe(processed_df.head(10), use_container_width=True)

st.download_button(
    label="📥 Download Enriched Excel File",
    data=to_excel_bytes(processed_df),
    file_name="AI_Extracted_Report.xlsx",
    mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
)
202
+
203
+ # ---------------------------------------------------------------------------
204
+ # Footer
205
+ # ---------------------------------------------------------------------------
206
+
# Footer markup; the `footer` class is styled by the CSS injected near the top
# of the script.
footer_html = (
    '<div class="footer">Powered by '
    '<a href="https://github.com/fastino-ai/GLiNER2" target="_blank">GLiNER2</a>'
    " • Open-source Zero-Shot Named Entity Recognition</div>"
)

st.markdown("---")
# unsafe_allow_html is needed for the raw <div>/<a> tags to be rendered.
st.markdown(footer_html, unsafe_allow_html=True)