rubentsui commited on
Commit
a03715b
·
verified ·
1 Parent(s): 3597093

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +354 -37
src/streamlit_app.py CHANGED
@@ -1,40 +1,357 @@
1
- import altair as alt
2
- import numpy as np
 
 
 
 
3
  import pandas as pd
 
4
  import streamlit as st
5
 
6
- """
7
- # Welcome to Streamlit!
8
-
9
- Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
10
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
11
- forums](https://discuss.streamlit.io).
12
-
13
- In the meantime, below is an example of what you can do with just a few lines of code:
14
- """
15
-
16
- num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
17
- num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
18
-
19
- indices = np.linspace(0, 1, num_points)
20
- theta = 2 * np.pi * num_turns * indices
21
- radius = indices
22
-
23
- x = radius * np.cos(theta)
24
- y = radius * np.sin(theta)
25
-
26
- df = pd.DataFrame({
27
- "x": x,
28
- "y": y,
29
- "idx": indices,
30
- "rand": np.random.randn(num_points),
31
- })
32
-
33
- st.altair_chart(alt.Chart(df, height=700, width=700)
34
- .mark_point(filled=True)
35
- .encode(
36
- x=alt.X("x", axis=None),
37
- y=alt.Y("y", axis=None),
38
- color=alt.Color("idx", legend=None, scale=alt.Scale()),
39
- size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
40
- ))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import tempfile
3
+ import time
4
+ from random import random
5
+ from pathlib import Path
6
+
7
  import pandas as pd
8
+ import polars as pl
9
  import streamlit as st
10
 
11
+ import requests
12
+ import cloudscraper
13
+
14
+ from bs4 import BeautifulSoup
15
+ import regex as re
16
+
17
+ import subprocess
18
+
19
+
20
# Root URL of the Taiwan Panorama magazine site; article paths are appended to it.
base_url = 'https://www.taiwan-panorama.com/'
#%%
22
+
23
def getPage(url):
    """Fetch *url*, preferring a cloudscraper session with a plain
    requests fallback.

    Retries up to MAXTRIALS times, sleeping a randomized 5-11 s between
    failed attempts (no sleep after a successful fetch).

    Returns:
        (status_code, text) on success, with the response body decoded
        as UTF-8, or ``None`` when every attempt failed.
    """
    headers = {'user-agent': 'Chrome/143.0.7499.170'}
    scraper = cloudscraper.create_scraper(
        browser={
            'browser': 'chrome',
            'platform': 'windows',
            'mobile': False
        }
    )

    MAXTRIALS = 10
    cnt_fail = 0
    res = None
    while res is None and cnt_fail <= MAXTRIALS:
        try:
            res = scraper.get(url)
        except requests.exceptions.RequestException:
            # Cloudscraper failed; try a plain requests fetch before
            # counting this round as a failure.
            try:
                res = requests.get(url, headers=headers)
            except requests.exceptions.RequestException:
                cnt_fail += 1
                print(f"failed {cnt_fail} time(s)...[{url}]", flush=True)
        if res is None:
            # Randomized back-off so repeated retries are not bursty.
            time.sleep(5 + random() * 6)

    if res is None:
        return None
    res.encoding = 'utf-8'  # site serves UTF-8; force consistent decoding
    return res.status_code, res.text
55
+
56
+
57
def retrieveTWP(src_url, lang):
    """Scrape a Taiwan Panorama article together with its translation.

    Args:
        src_url: URL of the Chinese-language article page.
        lang: target-language code (e.g. 'en', 'ja') of the parallel
            article to fetch.

    Returns:
        (paras_zh, paras_en): two lists of non-empty paragraph strings
        (h1/h2/p text of the first <article>). Either element is
        ``None`` when the corresponding page could not be retrieved or
        parsed.
    """

    def _extract_paras(html):
        # Pull text from h1/h2/p inside the first <article>; drop blanks.
        soup = BeautifulSoup(html, 'lxml')
        articles = soup.find_all('article')
        if not articles:
            return None, soup
        paras = articles[0].find_all(('h1', 'h2', 'p'))
        texts = [p.text.strip() for p in paras]
        return [t for t in texts if t], soup

    paras_zh = None
    paras_en = None
    soup = None

    # --- source (zh) page ---
    page = getPage(src_url)
    if page is not None:          # getPage returns None after repeated failures
        status, html = page
        if status == 200:
            try:
                paras_zh, soup = _extract_paras(html)
            except Exception:
                paras_zh = None

    # --- target-language page ---
    # Only attempt the translation when the zh page parsed and a
    # cross-language Guid link was found (getURLlang may return None).
    guid = getURLlang(soup, lang) if soup is not None else None
    if guid:
        tgt_url = base_url + f"/{lang}/Articles/Details?Guid=" + guid
        page = getPage(tgt_url)
        if page is not None:
            status, html = page
            if status == 200:
                try:
                    paras_en, _ = _extract_paras(html)
                except Exception:
                    paras_en = None

    return paras_zh, paras_en
91
+
92
+
93
def getURLlang(soup, lang):
    """
    Input: Parsed HTML of zh article
    Output: Guid of the same article in language "lang" (e.g., 'en', 'ja'),
    or None when no matching link is found.
    """
    # Guid may be the last query parameter, so accept either a trailing
    # '&' or end-of-string (the previous pattern required the '&' and
    # silently missed such links).
    guid_regex = re.compile(r"Guid=([\da-z-]+?)(?:&|$)")

    urls = soup.find_all('a', {'href': re.compile(fr"^/{lang}/Articles/Details\?Guid=")})
    if urls:
        guids = guid_regex.findall(urls[0]['href'])
        if guids:
            return guids[0]

    return None
107
+
108
+
109
def save_uploaded_file(uploaded_file):
    """
    Persist a Streamlit upload to a temporary file on disk.

    The temp file keeps the upload's extension (defaulting to ".pdf"
    when there is none) and is NOT auto-deleted — the caller owns
    cleanup. Returns the saved file's path, or None on failure (the
    error is surfaced in the UI).
    """
    try:
        _, ext = os.path.splitext(uploaded_file.name)
        tmp = tempfile.NamedTemporaryFile(delete=False, suffix=ext or ".pdf")
        with tmp:
            tmp.write(uploaded_file.getvalue())
        return tmp.name
    except Exception as e:
        st.error(f"Error saving file: {e}")
        return None
125
+
126
+
127
# Import Defense Digest Processor
# Optional dependency: when defense_digest (or one of its own deps) is
# missing, fall back to None so processInputData can skip that pipeline,
# and surface the problem in the Streamlit UI at import time.
try:
    from defense_digest import DefenseDigestProcessor
except ImportError:
    DefenseDigestProcessor = None
    st.error(
        "Could not import DefenseDigestProcessor. Make sure dependencies are installed."
    )
135
+
136
+
137
def processInputData(files=None, urls=None, input_type=None):
    """Dispatch processing based on *input_type* and return a DataFrame.

    Args:
        files: Streamlit UploadedFile objects (PDFs), or None.
        urls: (source_url, target_url) pair, or None.
        input_type: "Defense Digest", "Taiwan Panorama", or
            "Scientific American Taiwan".

    Returns:
        A DataFrame with columns cosine_distance, source_language,
        target_language — real alignment output when available,
        otherwise dummy placeholder rows.
    """

    # --- Defense Digest: run each uploaded PDF through the processor ---
    if input_type == "Defense Digest" and DefenseDigestProcessor:
        processor = DefenseDigestProcessor()
        all_dfs = []

        progress_bar = st.progress(0)
        status_text = st.empty()
        log_area = st.expander("Processing Log", expanded=True)
        logs = []

        def update_progress(msg):
            status_text.text(msg)
            logs.append(msg)
            # Keep only the last 20 lines for display to avoid clutter.
            log_area.code("\n".join(logs[-20:]))

        if files:
            for i, uploaded_file in enumerate(files):
                saved_path = save_uploaded_file(uploaded_file)
                if saved_path:
                    update_progress(f"Processing {uploaded_file.name}...")
                    try:
                        # Save intermediate files to the current directory;
                        # pass the original filename as display_name.
                        df = processor.process_pdf(
                            saved_path,
                            output_dir=os.getcwd(),
                            progress_callback=update_progress,
                            display_name=uploaded_file.name,
                        )
                        if df is not None and not df.empty:
                            all_dfs.append(df)
                    except Exception as e:
                        st.error(f"Error processing {uploaded_file.name}: {e}")
                    finally:
                        # Temp-file removal is intentionally disabled for
                        # debugging; restore os.remove(saved_path) later.
                        pass
                progress_bar.progress((i + 1) / len(files))

        if all_dfs:
            return pd.concat(all_dfs, ignore_index=True)
        return pd.DataFrame(
            columns=["cosine_distance", "source_language", "target_language"]
        )

    # Stem of the source text file written by the Panorama branch; the
    # fallback below uses it to locate vecalign output. Initialized here
    # so the other branches cannot hit a NameError.
    fin = None

    if input_type == "Taiwan Panorama":
        if not urls:
            # Guard: the original code indexed urls[0] and crashed when
            # only files (no URL pair) were supplied.
            st.warning("Taiwan Panorama processing requires a pair of URLs.")
        else:
            src_url = urls[0]
            tgt_url = urls[1]

            lang = 'en'  # target language

            st.success("Retrieving Panorama pages...")
            paras_zh, paras_en = retrieveTWP(src_url, lang)
            st.success("Completed retrieval of Panorama pages...")

            if not paras_zh or not paras_en:
                # retrieveTWP returns None for a page that failed to
                # download or parse; previously this crashed on paras_zh[0].
                st.error("Could not retrieve both language versions of the article.")
            else:
                fon_src = f'{paras_zh[0]}.zh.txt'
                fon_tgt = f'{paras_zh[0]}.{lang}.txt'

                with open(fon_src, 'w', encoding='utf-8', newline='\n') as fo:
                    fo.write('\n'.join(paras_zh) + '\n')
                st.success(f"Written source file: {fon_src}")

                with open(fon_tgt, 'w', encoding='utf-8', newline='\n') as fo:
                    fo.write('\n'.join(paras_en) + '\n')
                st.success(f"Written target file: {fon_tgt}")

                st.success(f"Source URL: {src_url}")
                st.success(f"Target URL: {tgt_url}")

                start_time = time.perf_counter()
                st.success("Begin aligning bitext...")
                fin = paras_zh[0]
                result = subprocess.run(['python', 'alignGenericGGUF.py', fin], capture_output=True, text=True)
                duration = time.perf_counter() - start_time
                st.success(f"Done aligning bitext in: {duration:.2f} seconds")
                print(result.stdout)
                print(result.stderr)

    elif input_type == "Scientific American Taiwan":
        # Not implemented yet.
        pass

    # --- Default / fallback logic for other types or if processor fails ---

    # Save uploads to disk (many PDF libraries want a real path).
    file_paths = []
    if files:
        for uploaded_file in files:
            saved_path = save_uploaded_file(uploaded_file)
            if saved_path:
                file_paths.append(saved_path)

    # Dummy data
    data = {
        "cosine_distance": [0.1, 0.2, 0.05, 0.3],
        "source_language": [
            "This is a sentence.",
            "Another sentence.",
            "Hello world.",
            "Testing.",
        ],
        "target_language": [
            "C'est une phrase.",
            "Une autre phrase.",
            "Bonjour le monde.",
            "Test.",
        ],
    }

    # Prefer real vecalign output when the Panorama branch produced it;
    # previously `fin` was undefined on non-Panorama paths (NameError).
    aligned_files = list(Path('./').rglob(f"{fin}.vecalign*.txt")) if fin else []
    if aligned_files:
        df = pl.read_csv(
            source=aligned_files[0],
            separator='\t',
            has_header=True,
            null_values='',
        )
    else:
        df = pd.DataFrame(data)

    return df
280
+
281
def main():
    """Streamlit entry point: render the sidebar inputs and, on demand,
    run processInputData and display the resulting table."""
    st.set_page_config(page_title="Bitext Aligner", layout="wide")

    st.title("Bitext Alignment Tool")

    # Fixed column widths + word wrapping so long sentences stay readable
    # in st.table below.
    st.markdown("""
    <style>
    table {
        width: 100%;
    }
    th:nth-child(1) { width: 50px; }
    th:nth-child(2) { width: 80px; }
    th:nth-child(3) { width: 100px; }
    th:nth-child(4) { width: 500px; }
    th:nth-child(5) { width: 100px; }
    th:nth-child(6) { width: 500px; }
    td {
        word-wrap: break-word;
        min-width: 50px;
        max-width: 400px;
        white-space: normal !important;
    }
    </style>
    """, unsafe_allow_html=True)

    # Sidebar for inputs
    with st.sidebar:
        st.header("Input Settings")

        # 3. Nature of input info
        input_type = st.radio(
            "Select Input Nature:",
            ("Defense Digest", "Taiwan Panorama", "Scientific American Taiwan"),
        )

        st.divider()

        # 1. File Upload (PDFs)
        st.subheader("Upload PDFs")
        uploaded_files = st.file_uploader(
            "Upload one or two PDF files", type=["pdf"], accept_multiple_files=True
        )

        st.divider()

        # 2. URLs
        st.subheader("Enter URLs")
        url1 = st.text_input("URL 1 (Source)")
        url2 = st.text_input("URL 2 (Target)")

        process_btn = st.button("Process")

    # Main area
    if process_btn:
        if not uploaded_files and not (url1 and url2):
            st.warning("Please upload files or provide a pair of URLs.")
        else:
            with st.spinner("Processing..."):
                # Call dummy business logic
                # URLs are forwarded only when BOTH fields are filled in.
                df = processInputData(
                    files=uploaded_files,
                    urls=(url1, url2) if url1 and url2 else None,
                    input_type=input_type,
                )

            st.success("Processing Complete!")

            # 4. Display data in text grid
            #st.dataframe(df, width="stretch")
            st.table(df)

            # Option to download as Excel (implied by requirement to create Excel file)
            # For now, we just show the dataframe as requested.
354
+
355
#%%
# Script entry point (Streamlit re-runs this module on every interaction).
if __name__ == "__main__":
    main()