AsyncBuilds committed on
Commit
6b23bf7
·
verified ·
1 Parent(s): 7953b99

Removed extra file

Browse files
Files changed (1) hide show
  1. app.py +0 -213
app.py DELETED
@@ -1,213 +0,0 @@
1
- import time
2
- from io import BytesIO
3
-
4
- import pandas as pd
5
- import streamlit as st
6
- from gliner2 import GLiNER2
7
-
8
- # ---------------------------------------------------------------------------
9
- # Constants
10
- # ---------------------------------------------------------------------------
11
-
12
# Predefined entity labels, grouped by theme. The three groups are
# concatenated below into the option list offered by the field picker.
PERSONAL_FIELDS = [
    "Person Name",
    "Email Address",
    "Phone Number",
    "Street Address",
    "City",
    "Country",
    "Date of Birth",
]
PROFESSIONAL_FIELDS = [
    "Company Name",
    "Department",
    "Job Title",
    "Office Location",
    "Employee ID",
    "Skills",
    "University",
]
BUSINESS_FIELDS = [
    "Counterparty",
    "Contract Value",
    "Effective Date",
    "Jurisdiction",
    "Governing Law",
    "Invoice Number",
    "Product Name",
    "Project Name",
]
# Personal labels first, then professional, then business.
ALL_PREDEFINED_FIELDS = [*PERSONAL_FIELDS, *PROFESSIONAL_FIELDS, *BUSINESS_FIELDS]

# Identifier handed to GLiNER2.from_pretrained.
MODEL_ID = "fastino/gliner2-base-v1"
# Value passed as `threshold` to model.extract_entities.
EXTRACTION_THRESHOLD = 0.4
28
-
29
- # ---------------------------------------------------------------------------
30
- # Page config & styles
31
- # ---------------------------------------------------------------------------
32
-
33
# Page-level settings (title, icon, centered layout), applied before any
# other Streamlit call in the script.
_PAGE_SETTINGS = {
    "page_title": "AI Excel Entity Extractor",
    "page_icon": "🔍",
    "layout": "centered",
}
st.set_page_config(**_PAGE_SETTINGS)

# Stylesheet injected into the page: app background, full-width primary
# button with a hover colour, and the `.footer` class used at the bottom.
_CUSTOM_CSS = """
<style>
.stApp { background-color: #fcfcfc; }
div.stButton > button:first-child {
width: 100%;
border-radius: 8px;
height: 3.5em;
background-color: #2563eb;
color: white;
font-weight: bold;
border: none;
}
div.stButton > button:hover { background-color: #1d4ed8; border: none; }
.footer { text-align: center; color: #64748b; font-size: 0.85rem; margin-top: 50px; }
</style>
"""
st.html(_CUSTOM_CSS)
55
-
56
- # ---------------------------------------------------------------------------
57
- # Cached resources & helpers
58
- # ---------------------------------------------------------------------------
59
-
60
@st.cache_resource(show_spinner="Loading AI model…")
def load_model() -> GLiNER2:
    """Instantiate the GLiNER2 model identified by ``MODEL_ID``.

    The function is wrapped in ``st.cache_resource`` and shows a
    "Loading AI model…" spinner while it runs.

    Returns:
        The object produced by ``GLiNER2.from_pretrained(MODEL_ID)``.
    """
    extractor = GLiNER2.from_pretrained(MODEL_ID)
    return extractor
63
-
64
-
65
@st.cache_data(show_spinner=False)
def load_excel(file) -> pd.DataFrame:
    """Read a workbook into a DataFrame using ``pd.read_excel`` defaults.

    The function is wrapped in ``st.cache_data`` with the spinner disabled.

    Args:
        file: Whatever ``pd.read_excel`` accepts; the script passes the
            object returned by ``st.file_uploader``.

    Returns:
        The parsed sheet as a ``pandas.DataFrame``.
    """
    frame = pd.read_excel(file)
    return frame
68
-
69
-
70
def to_excel_bytes(df: pd.DataFrame) -> bytes:
    """Serialize *df* to an in-memory workbook and return its raw bytes.

    The sheet is written with the ``openpyxl`` engine and without the
    DataFrame index.

    Args:
        df: Table to export.

    Returns:
        The complete workbook content as ``bytes``.
    """
    stream = BytesIO()
    writer = pd.ExcelWriter(stream, engine="openpyxl")
    with writer:
        df.to_excel(excel_writer=writer, index=False)
    # The writer is closed at this point, so the buffer holds the full file.
    stream.seek(0)
    return stream.read()
75
-
76
-
77
def parse_custom_labels(raw: str) -> list[str]:
    """Split a comma-separated string into trimmed, non-empty labels.

    Args:
        raw: Free-form text such as ``"Case Number, Part ID"``.

    Returns:
        The labels in their original order, with surrounding whitespace
        removed and empty entries dropped.
    """
    trimmed_pieces = map(str.strip, raw.split(","))
    return [piece for piece in trimmed_pieces if piece]
79
-
80
-
81
def is_valid_text(value: str) -> bool:
    """Tell whether *value* holds text worth sending to the extractor.

    A value is rejected when it is empty or whitespace-only, or when it is
    the string ``"nan"`` in any letter case (what ``str()`` yields for a
    missing cell). Surrounding whitespace is ignored for both checks, so a
    padded placeholder such as ``" nan "`` is rejected as well.

    Args:
        value: The cell content, already converted to ``str``.

    Returns:
        ``True`` if the value contains usable text, ``False`` otherwise.
    """
    stripped = value.strip()
    return bool(stripped) and stripped.lower() != "nan"
83
-
84
- # ---------------------------------------------------------------------------
85
- # UI - Header
86
- # ---------------------------------------------------------------------------
87
-
88
# Page heading followed by a one-sentence description of the tool.
st.title("🔍 AI Excel Entity Extractor")
_INTRO_TEXT = (
    "Automatically extract specific entities like Name, Email, etc., "
    "from your spreadsheet text using GLiNER2 Zero-Shot AI."
)
st.markdown(_INTRO_TEXT)
93
-
94
- # ---------------------------------------------------------------------------
95
- # Step 1: Upload
96
- # ---------------------------------------------------------------------------
97
-
98
st.write("### 1. Source Data")
uploaded_file = st.file_uploader("Upload an Excel file (.xlsx)", type="xlsx")

# Without a file there is nothing to configure: show a three-panel
# explainer and end this script run.
if not uploaded_file:
    st.write("### How it works")
    how_it_works = (
        "**1. Upload**\nDrop an Excel file with a column of text (e.g., emails, descriptions, or notes).",
        "**2. Define**\nSelect from common entities like Names and Dates, or type your own custom fields.",
        "**3. Extract**\nThe AI reads every row and creates new columns for every entity it discovers.",
    )
    for panel, blurb in zip(st.columns(3), how_it_works):
        with panel:
            st.markdown(blurb)
    st.stop()
111
-
112
- # ---------------------------------------------------------------------------
113
- # Step 2: Configure
114
- # ---------------------------------------------------------------------------
115
-
116
df = load_excel(uploaded_file)

# A sheet without data cannot be analyzed: report it and end this run.
if df.empty:
    st.error("The uploaded file appears to be empty. Please upload a file with data.")
    st.stop()

row_count = len(df)

st.divider()
st.write("### 2. Configure Extraction")

with st.spinner("Loading configuration…"), st.container(border=True):
    # Top row: source-column picker next to the row counter.
    source_col, stats_col = st.columns([2, 1])
    with source_col:
        text_column = st.selectbox("Select text column to analyze:", df.columns)
    with stats_col:
        st.metric("Total Rows", f"{row_count:,}")

    st.write("---")

    # Bottom row: predefined labels on the left, free-form labels on the right.
    preset_col, custom_col = st.columns(2)
    with preset_col:
        selected_labels = st.multiselect(
            "Select Fields to Extract:",
            options=ALL_PREDEFINED_FIELDS,
            default=["Person Name", "Company Name"],
            help="Choose common entities from the library.",
        )
    with custom_col:
        custom_labels_str = st.text_area(
            "Custom Entities (Comma Separated):",
            placeholder="e.g. Case Number, Part ID, Deadline",
            help="Define unique entities specific to your data.",
        )

# Merge both sources; dict.fromkeys drops duplicates and keeps first-seen order.
requested_labels = selected_labels + parse_custom_labels(custom_labels_str)
active_labels = list(dict.fromkeys(requested_labels))
153
-
154
- # ---------------------------------------------------------------------------
155
- # Step 3: Extract
156
- # ---------------------------------------------------------------------------
157
-
158
# Nothing below runs until the user presses the button.
if not st.button("🚀 Extract Fields"):
    st.stop()

if not active_labels:
    st.warning("⚠️ Please select or define at least one entity to extract.")
    st.stop()

model = load_model()
processed_df = df.copy()

# Map every requested label to an output column that does not clash with a
# column already present in the sheet. Writing "" straight into
# processed_df[label] would wipe an existing column of the same name,
# including the selected text column itself.
output_columns = {}
taken_names = set(df.columns)
for label in active_labels:
    column_name = label
    while column_name in taken_names:
        column_name = f"{column_name} (extracted)"
    taken_names.add(column_name)
    output_columns[label] = column_name
    processed_df[column_name] = ""

status = st.empty()
progress_bar = st.progress(0)
# Monotonic clock: the elapsed time cannot be skewed by wall-clock changes.
start_time = time.perf_counter()

# Read the text from the untouched input frame and count rows by position,
# so progress and messages do not depend on the index labels.
for position, (row_label, raw_value) in enumerate(df[text_column].items(), start=1):
    text = str(raw_value)
    if is_valid_text(text):
        try:
            results = model.extract_entities(text, active_labels, threshold=EXTRACTION_THRESHOLD)
            for label, found_list in results.get("entities", {}).items():
                target_column = output_columns.get(label, label)
                processed_df.at[row_label, target_column] = ", ".join(found_list)
        except Exception as e:
            # Best effort: a failing row is reported and the run continues.
            st.warning(f"Row {position} skipped due to an error: {e}")

    progress_bar.progress(position / row_count)
    status.text(f"Extracting fields from row {position} of {row_count}…")

duration = round(time.perf_counter() - start_time, 1)
progress_bar.empty()
status.empty()

st.success(f"✅ Extraction complete - {row_count:,} rows processed in {duration}s.")

st.write("### 3. Extraction Preview")
st.dataframe(processed_df.head(10), use_container_width=True)

st.download_button(
    label="📥 Download Enriched Excel File",
    data=to_excel_bytes(processed_df),
    file_name="AI_Extracted_Report.xlsx",
    mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
)
202
-
203
- # ---------------------------------------------------------------------------
204
- # Footer
205
- # ---------------------------------------------------------------------------
206
-
207
# Horizontal rule, then a credit line styled by the `.footer` CSS class.
st.markdown("---")
_FOOTER_HTML = (
    '<div class="footer">Powered by '
    '<a href="https://github.com/fastino-ai/GLiNER2" target="_blank">GLiNER2</a>'
    " • Open-source Zero-Shot Named Entity Recognition</div>"
)
st.markdown(_FOOTER_HTML, unsafe_allow_html=True)