WhaleCancer committed on
Commit
99f1718
·
1 Parent(s): 0d5b03d
Files changed (3) hide show
  1. app.py +8 -1
  2. askURL.py +4 -3
  3. processAgain.py +1 -1
app.py CHANGED
@@ -107,10 +107,17 @@ with gr.Blocks() as demo:
107
 
108
  with gr.Row():
109
  with gr.Column():
110
- btn_AttachArticles = gr.Button("🐢 Include Articles to [Cleaned Search Matrix Results]")
111
  with gr.Column():
112
  file_AttachedSearchResults = gr.File(label="🗃️ [Cleaned Search Matrix Results Including Articles]", file_types=['.tsv'], height=85)
113
 
 
 
 
 
 
 
 
114
  with gr.Row():
115
  btn_sendToAIInterrogationTab = gr.Button('Send to AI Interrogation Tab', variant='primary')
116
 
 
107
 
108
  with gr.Row():
109
  with gr.Column():
110
+ btn_AttachArticles = gr.Button("🐌 Include [Processed Search Matrix Results]")
111
  with gr.Column():
112
  file_AttachedSearchResults = gr.File(label="🗃️ [Cleaned Search Matrix Results Including Articles]", file_types=['.tsv'], height=85)
113
 
114
+ # tokenizer
115
+ with gr.Row():
116
+ with gr.Column():
117
+ gr.Button('')
118
+ with gr.Column():
119
+ gr.File('')
120
+
121
  with gr.Row():
122
  btn_sendToAIInterrogationTab = gr.Button('Send to AI Interrogation Tab', variant='primary')
123
 
askURL.py CHANGED
@@ -25,9 +25,8 @@ def get_token_length(text):
25
 
26
  return token_length
27
 
28
-
29
  def get_main_text_from_url(url):
30
- print('get_main_text_from_url...')
31
  # Define the fake header
32
  headers = {
33
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
@@ -48,10 +47,12 @@ def get_main_text_from_url(url):
48
  if main_text_element is None:
49
  main_text_element = soup.find('div', {'class': 'content-area'}) # tri-cities dispatch
50
  if main_text_element is None:
51
- main_text_element = soup.find('article')
52
  if main_text_element is None:
53
  main_text_element = soup.find('body')
54
 
 
 
55
  if main_text_element:
56
  main_text = main_text_element.text
57
  # print(main_text)
 
25
 
26
  return token_length
27
 
 
28
  def get_main_text_from_url(url):
29
+ # print('get_main_text_from_url...')
30
  # Define the fake header
31
  headers = {
32
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
 
47
  if main_text_element is None:
48
  main_text_element = soup.find('div', {'class': 'content-area'}) # tri-cities dispatch
49
  if main_text_element is None:
50
+ main_text_element = soup.find('article', {'class': 'article-content-story article-content-story--story'})
51
  if main_text_element is None:
52
  main_text_element = soup.find('body')
53
 
54
+
55
+
56
  if main_text_element:
57
  main_text = main_text_element.text
58
  # print(main_text)
processAgain.py CHANGED
@@ -70,7 +70,7 @@ def attach_articles(file):
70
 
71
  output_fname = 'output_2zzz.tsv'
72
 
73
- # Write the processed data to a new file
74
  with open(output_fname, 'w') as output_file:
75
  # Define the fieldnames for the output file
76
  fieldnames = list(reader.fieldnames) + ["Content"]
 
70
 
71
  output_fname = 'output_2zzz.tsv'
72
 
73
+ # Write the processed data to a new file
74
  with open(output_fname, 'w') as output_file:
75
  # Define the fieldnames for the output file
76
  fieldnames = list(reader.fieldnames) + ["Content"]