NealCaren committed (verified)
Commit 913803c · 1 Parent(s): 53097ac

Upload folder using huggingface_hub

.DS_Store ADDED
Binary file (6.15 kB).
 
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ abstract-and-embeddings.json filter=lfs diff=lfs merge=lfs -text
+ scopus/scopus-soc-journals.csv filter=lfs diff=lfs merge=lfs -text
.github/workflows/update_space.yml ADDED
@@ -0,0 +1,28 @@
+ name: Run Python script
+
+ on:
+   push:
+     branches:
+       - main
+
+ jobs:
+   build:
+     runs-on: ubuntu-latest
+
+     steps:
+       - name: Checkout
+         uses: actions/checkout@v2
+
+       - name: Set up Python
+         uses: actions/setup-python@v2
+         with:
+           python-version: '3.9'
+
+       - name: Install Gradio
+         run: python -m pip install gradio
+
+       - name: Log in to Hugging Face
+         run: python -c 'import huggingface_hub; huggingface_hub.login(token="${{ secrets.hf_token }}")'
+
+       - name: Deploy to Spaces
+         run: gradio deploy
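
Note on the login step above: the hf_token secret is interpolated directly into a python -c command line. A minimal Python sketch of the same login reading the token from an environment variable instead is below; the HF_TOKEN variable name is an assumption (it would have to be set on the step from secrets.hf_token) and is not something this workflow defines.

# Hedged sketch: log in to Hugging Face with a token passed through the environment
# rather than spliced into the command line. huggingface_hub.login(token=...) is the
# same call the workflow already uses; HF_TOKEN is an assumed variable name.
import os

import huggingface_hub

huggingface_hub.login(token=os.environ["HF_TOKEN"])  # raises KeyError if HF_TOKEN is unset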
.gitignore ADDED
@@ -0,0 +1,10 @@
+ # Python-generated files
+ __pycache__/
+ *.py[oc]
+ build/
+ dist/
+ wheels/
+ *.egg-info
+
+ # Virtual environments
+ .venv
.python-version ADDED
@@ -0,0 +1 @@
+ 3.10
README.md CHANGED
@@ -1,12 +1,6 @@
  ---
- title: Find Sociology
- emoji: 🦀
- colorFrom: red
- colorTo: gray
+ title: Find_Sociology
+ app_file: app.py
  sdk: gradio
  sdk_version: 5.28.0
- app_file: app.py
- pinned: false
  ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
abstract-and-embeddings.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fc8e995d67cda80ec5cb6e4fdc96adca7b13c66b239f6a1f6e5c83a346dcda2e
+ size 267552642
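
What is committed here is a Git LFS pointer (spec version, SHA-256 object id, and size in bytes), not the roughly 268 MB JSON file itself; Git LFS fetches the real content at checkout. Once the file is materialized, app.py (added below) reads it as a list of article records. A hedged sketch of inspecting that structure, assuming the file has already been pulled locally:

# Hedged sketch: peek at the record structure app.py expects in abstract-and-embeddings.json.
import json

with open("abstract-and-embeddings.json", encoding="utf-8") as f:
    records = json.load(f)  # a list of dicts, one per article

first = records[0]
print(len(records), "records loaded")
print(sorted(first.keys()))     # doi, title, authors, author_full_names, year, source, abstract, embedding
print(len(first["embedding"]))  # 384 dimensions for all-MiniLM-L6-v2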
app.py ADDED
@@ -0,0 +1,498 @@
+ # /// script
+ # requires-python = ">=3.9"
+ # dependencies = [
+ #     "gradio",
+ #     "sentence-transformers",
+ #     "pandas",
+ #     "numpy",
+ #     "scikit-learn",
+ #     "torch",
+ #     "torchvision",
+ #     "torchaudio",
+ # ]
+ # ///
+
+ # 1. Import Libraries
+ import gradio as gr
+ import pandas as pd
+ import numpy as np
+ import os
+ import json
+ import functools
+ import re  # Import regex for parsing author names/IDs
+ from collections import Counter  # For counting author occurrences
+
+ from sentence_transformers import SentenceTransformer
+ from sklearn.metrics.pairwise import cosine_similarity
+ import torch  # Required by sentence-transformers
+
+ # 2. Constants
+ MODEL_NAME = 'all-MiniLM-L6-v2'
+ SCOPUS_FOLDER = 'scopus'  # Folder containing Scopus CSV files
+ DATA_FILE = 'abstract-and-embeddings.json'
+
+ # --- Helper Functions ---
+
+ # Cache model loading
+ @functools.lru_cache(maxsize=None)
+ def load_model(model_name=MODEL_NAME):
+     """Loads the Sentence Transformer model."""
+     print(f"Loading Sentence Transformer model: {model_name}...")
+     try:
+         # Check for GPU availability
+         device = 'cuda' if torch.cuda.is_available() else 'cpu'
+         print(f"Using device: {device}")
+         model = SentenceTransformer(model_name, device=device)
+         print("Model loaded successfully.")
+         return model
+     except Exception as e:
+         print(f"Error loading model: {e}")
+         # Fallback to CPU if GPU loading fails unexpectedly
+         try:
+             print("Attempting to load model on CPU...")
+             model = SentenceTransformer(model_name, device='cpu')
+             print("Model loaded successfully on CPU.")
+             return model
+         except Exception as fallback_e:
+             print(f"Fallback CPU loading failed: {fallback_e}")
+             raise
+
+ # 3. Data Preprocessing
+ def preprocess_data(model, scopus_folder=SCOPUS_FOLDER, data_filepath=DATA_FILE):
+     """
+     Scans the scopus_folder for CSV files, reads them, generates embeddings
+     for new abstracts, and saves to JSON. Avoids adding duplicates based on DOI.
+     Includes 'Author full names'.
+     """
+     processed_data = []
+     processed_dois = set()
+     total_new_entries = 0  # Accumulator for new entries across all files
+
+     # Load existing data if available
+     if os.path.exists(data_filepath):
+         try:
+             with open(data_filepath, 'r', encoding='utf-8') as f:
+                 processed_data = json.load(f)
+             # Extract existing DOIs
+             for item in processed_data:
+                 if 'doi' in item and item['doi'] is not None:  # Check for None DOI
+                     processed_dois.add(str(item['doi']).lower())
+             print(f"Loaded {len(processed_data)} items from {data_filepath}. Found {len(processed_dois)} existing unique DOIs.")
+         except json.JSONDecodeError:
+             print(f"Warning: Could not decode JSON from {data_filepath}. Starting fresh.")
+             processed_data = []  # Reset processed_data
+         except Exception as e:
+             print(f"Warning: Error loading {data_filepath}: {e}. Starting fresh.")
+             processed_data = []  # Reset processed_data
+
+     # Check if Scopus folder exists
+     if not os.path.isdir(scopus_folder):
+         print(f"Error: Scopus folder not found at '{scopus_folder}'")
+         if not processed_data:
+             print("No existing data file found either. Cannot proceed.")
+             return []
+         else:
+             print("Proceeding with existing data from JSON file.")
+             return processed_data
+
+     print(f"Scanning folder '{scopus_folder}' for CSV files...")
+     # Iterate through files in the Scopus folder
+     for filename in os.listdir(scopus_folder):
+         # Check if the file is a CSV
+         if filename.lower().endswith('.csv'):
+             csv_filepath = os.path.join(scopus_folder, filename)
+             print(f"\n--- Processing file: {csv_filepath} ---")
+
+             # Read the Scopus CSV
+             try:
+                 # Try reading with default UTF-8, fallback to latin1 if error
+                 try:
+                     df = pd.read_csv(csv_filepath, encoding='utf-8')
+                 except UnicodeDecodeError:
+                     print(f"UTF-8 decode failed for {filename}, trying latin1...")
+                     df = pd.read_csv(csv_filepath, encoding='latin1')
+                 print(f"Successfully read {len(df)} rows from {filename}.")
+             except Exception as e:
+                 print(f"Error reading CSV file {filename}: {e}. Skipping this file.")
+                 continue  # Skip to the next file
+
+             # Ensure required columns exist (Added 'Author full names')
+             required_cols = ['DOI', 'Abstract', 'Title', 'Authors', 'Author full names', 'Year', 'Source title']
+             if not all(col in df.columns for col in required_cols):
+                 print(f"Error: CSV file {filename} must contain columns: {required_cols}")
+                 missing_cols = [col for col in required_cols if col not in df.columns]
+                 print(f"Missing columns: {missing_cols}. Skipping this file.")
+                 continue  # Skip to the next file
+
+             file_new_entries_count = 0
+             # Prepare abstracts for batch embedding for this file
+             abstracts_to_embed = []
+             corresponding_rows = []
+
+             print(f"Processing rows in {filename}...")
+             # Drop rows missing DOI or Abstract *before* casting to str, so NaN
+             # values are not silently turned into the literal string 'nan'
+             df.dropna(subset=['DOI', 'Abstract'], inplace=True)  # Keep rows even if Author full names is NaN initially
+             # Ensure DOI, Abstract, and 'Author full names' are strings
+             # ('Author full names' can be a float column if every value is NaN)
+             df = df.astype({'DOI': str, 'Abstract': str, 'Author full names': str})
+
+             for index, row in df.iterrows():
+                 doi = row['DOI'].lower().strip()
+                 abstract = row['Abstract'].strip()
+                 author_full_names = row['Author full names']  # Keep as read, handle None later
+
+                 # Basic validation - skip if empty after stripping
+                 if not doi or not abstract:
+                     continue
+
+                 # Check if DOI already processed (from JSON or previous files in this run)
+                 if doi in processed_dois:
+                     continue
+
+                 # Add abstract and corresponding row index for batch processing
+                 abstracts_to_embed.append(abstract)
+                 corresponding_rows.append(row)
+                 processed_dois.add(doi)  # Add DOI here to prevent duplicates
+
+             # Generate embeddings in batches for this file
+             if abstracts_to_embed:
+                 print(f"Generating embeddings for {len(abstracts_to_embed)} new abstracts from {filename}...")
+                 try:
+                     embeddings = model.encode(abstracts_to_embed, show_progress_bar=True, batch_size=32)
+                     print("Embeddings generated for this batch.")
+
+                     # Add new entries to processed_data
+                     for i, row in enumerate(corresponding_rows):
+                         embedding_list = embeddings[i].tolist()
+                         # Ensure Author full names is stored as string or None
+                         auth_full_names_val = str(row['Author full names']) if pd.notna(row['Author full names']) else None
+
+                         new_entry = {
+                             'doi': str(row['DOI']).strip(),
+                             'title': row['Title'],
+                             'authors': row['Authors'],  # Keep the simpler Authors field too
+                             'author_full_names': auth_full_names_val,  # Store the detailed field
+                             'year': int(row['Year']) if pd.notna(row['Year']) else None,
+                             'source': row['Source title'],
+                             'abstract': row['Abstract'].strip(),
+                             'embedding': embedding_list
+                         }
+                         processed_data.append(new_entry)
+                         file_new_entries_count += 1
+                     total_new_entries += file_new_entries_count  # Add to overall count
+                     print(f"Added {file_new_entries_count} new entries from {filename}.")
+
+                 except Exception as e:
+                     print(f"Error during embedding generation or processing for {filename}: {e}")
+                     # Remove DOIs added in this failed batch to allow reprocessing if needed
+                     for row in corresponding_rows:
+                         processed_dois.discard(str(row['DOI']).lower().strip())
+             else:
+                 print(f"No new, unique abstracts found in {filename}.")
+
+     # Save updated data to JSON only if new entries were added across all files
+     if total_new_entries > 0:
+         print(f"\nTotal new entries added: {total_new_entries}. Saving updated data to {data_filepath}...")
+         try:
+             with open(data_filepath, 'w', encoding='utf-8') as f:
+                 json.dump(processed_data, f, indent=4)
+             print(f"Successfully saved {len(processed_data)} total items.")
+         except Exception as e:
+             print(f"Error saving data to {data_filepath}: {e}")
+     else:
+         print("\nNo new entries added across all files.")
+
+     return processed_data
+
+ # --- Author Parsing Helper ---
+ def parse_author_full_names(author_string):
+     """Parses the 'Author full names' string into a list of {'name': ..., 'id': ...} dicts."""
+     authors = []
+     if not author_string or pd.isna(author_string):
+         return authors
+     # Regex to find Name (anything before the last parenthesis) and ID (digits inside last parenthesis)
+     # Handles cases like "Author Name (ID)" and "Author Name" (where ID might be missing)
+     pattern = re.compile(r"^(.*?)\s*\((\d+)\)$")
+     individual_authors = author_string.split(';')
+     for part in individual_authors:
+         part = part.strip()
+         if not part:
+             continue
+         match = pattern.match(part)
+         if match:
+             name = match.group(1).strip()
+             author_id = match.group(2).strip()
+             authors.append({'name': name, 'id': author_id})
+         else:
+             # Handle cases without an ID - use name as ID for counting purposes? Or skip?
+             # For now, let's use the name itself as a pseudo-id if no numeric ID found
+             # This might group authors with same name but different IDs if format varies.
+             # A more robust approach might require cleaner data or skipping entries without IDs.
+             name = part
+             author_id = f"name_{name}"  # Create a pseudo-ID based on name
+             authors.append({'name': name, 'id': author_id})
+             # print(f"Warning: Could not parse ID for author '{part}'. Using name as identifier.")
+     return authors
+
+
+ # 4. Search Logic (Modified to return Markdown results)
+ def find_similar_articles(input_abstract, pos_terms, neg_terms, k, model, all_data):
+     """
+     Finds k similar articles based on abstract embeddings, adjusting for terms.
+     Also identifies authors appearing multiple times in the results.
+     Returns: (repeated_authors_df, results_markdown_string, status_message)
+     """
+     # --- Initial Checks ---
+     empty_df = pd.DataFrame()
+     empty_md = ""  # Empty string for Markdown return
+     if not input_abstract:
+         return empty_df, empty_md, "Please enter an abstract."
+     if not all_data:
+         return empty_df, empty_md, "Error: No article data available. Check CSV processing and JSON file."
+
+     # --- Prepare Data and Embeddings ---
+     try:
+         valid_data = [item for item in all_data if 'embedding' in item and isinstance(item['embedding'], list)]
+         if len(valid_data) != len(all_data):
+             print(f"Warning: {len(all_data) - len(valid_data)} items missing valid embeddings. Proceeding with {len(valid_data)} items.")
+         if not valid_data:
+             return empty_df, empty_md, "Error: No items with valid embeddings found."
+
+         stored_embeddings = np.array([item['embedding'] for item in valid_data])
+         article_metadata = valid_data
+     except KeyError:
+         return empty_df, empty_md, "Error: 'embedding' key missing in processed data. Check JSON file structure."
+     except Exception as e:
+         return empty_df, empty_md, f"Error preparing data for search: {e}"
+
+     if stored_embeddings.size == 0:
+         return empty_df, empty_md, "Error: No embeddings found in the data."
+
+     # --- Input Embedding Calculation ---
+     try:
+         input_embedding = model.encode([input_abstract.strip()])[0]
+         if pos_terms and pos_terms.strip():
+             pos_embedding = model.encode([pos_terms.strip()])[0]
+             input_embedding = input_embedding + pos_embedding
+             print("Adjusted embedding with positive terms.")
+         if neg_terms and neg_terms.strip():
+             neg_embedding = model.encode([neg_terms.strip()])[0]
+             input_embedding = input_embedding - neg_embedding
+             print("Adjusted embedding with negative terms.")
+     except Exception as e:
+         return empty_df, empty_md, f"Error generating embeddings for input/terms: {e}"
+
+     # --- Similarity Calculation ---
+     try:
+         similarities = cosine_similarity(input_embedding.reshape(1, -1), stored_embeddings)[0]
+     except Exception as e:
+         return empty_df, empty_md, f"Error calculating similarity: {e}"
+
+     # --- Get Top K Results ---
+     k = min(int(k), len(similarities))
+     if k <= 0:
+         return empty_df, empty_md, "Please select k > 0."
+
+     valid_indices = np.where(~np.isnan(similarities))[0]
+     if len(valid_indices) == 0:
+         return empty_df, empty_md, "Error: Could not compute valid similarities."
+
+     valid_similarities = similarities[valid_indices]
+     sorted_valid_indices_desc = np.argsort(valid_similarities)[::-1]
+     top_k_original_indices = valid_indices[sorted_valid_indices_desc[:k]]
+
+     # --- Format Main Results into Markdown String ---
+     results_markdown_parts = []
+     author_id_list_for_counting = []  # List to hold all author IDs from the results
+     author_id_to_name_map = {}  # Map ID to Name
+
+     for rank, i in enumerate(top_k_original_indices):
+         try:
+             article = article_metadata[i]
+             similarity_score = similarities[i]
+             doi_val = article.get('doi')
+             doi_link = f"https://doi.org/{doi_val}" if doi_val else None
+             title = article.get('title', 'N/A')
+             source = article.get('source', 'N/A')
+             abstract = article.get('abstract', 'N/A')
+             author_full_names_str = article.get('author_full_names')  # Get the string
+
+             # Parse authors
+             parsed_authors = parse_author_full_names(author_full_names_str)
+             author_names_only = [a['name'] for a in parsed_authors]
+             authors_display_str = "; ".join(author_names_only) if author_names_only else "N/A"
+
+             # Add author IDs for counting
+             for author_info in parsed_authors:
+                 author_id_list_for_counting.append(author_info['id'])
+                 if author_info['id'] not in author_id_to_name_map:
+                     author_id_to_name_map[author_info['id']] = author_info['name']
+
+             # Format Title (linked if DOI exists)
+             title_md = f"**{title}**"
+             if doi_link:
+                 title_md = f"**[{title}]({doi_link})**"
+
+             # Format Abstract using blockquote for indentation
+             # Use HTML for smaller font size within Markdown
+             # FIX: Perform replacements *before* inserting into f-string
+             if abstract:
+                 escaped_abstract = abstract.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")  # Escape '&' first so the generated '&lt;'/'&gt;' are not double-escaped
+                 formatted_abstract = escaped_abstract.replace("\n", "<br>> ")
+                 abstract_md = f'\n> <span style="font-size: smaller;">{formatted_abstract}</span>'
+             else:
+                 abstract_md = ""
+
+
+             # Assemble Markdown for this article
+             article_md = f"### Result {rank + 1} (Similarity: {similarity_score:.4f})\n" \
+                          f"{title_md}\n" \
+                          f"_{source}_\n" \
+                          f"{authors_display_str}\n" \
+                          f"{abstract_md}\n\n" \
+                          f"---"  # Separator
+
+             results_markdown_parts.append(article_md)
+
+         except IndexError:
+             print(f"Warning: Index {i} out of bounds for article_metadata (length {len(article_metadata)}). Skipping.")
+             continue
+         except Exception as e:
+             print(f"Warning: Error formatting result for index {i}: {e}. Skipping.")
+             continue
+
+     # Join all parts into a single Markdown string
+     results_markdown_string = "\n".join(results_markdown_parts)
+
+     if not results_markdown_string:
+         return empty_df, empty_md, "No results found matching the criteria."
+
+     # --- Calculate Repeated Authors ---
+     repeated_authors_list = []
+     if author_id_list_for_counting:
+         author_counts = Counter(author_id_list_for_counting)
+         for author_id, count in author_counts.items():
+             if count > 1:
+                 author_name = author_id_to_name_map.get(author_id, f"ID: {author_id}")  # Get name from map
+                 repeated_authors_list.append({
+                     "Author Name": author_name,
+                     "Count": count
+                 })
+
+     # Sort repeated authors by count descending
+     repeated_authors_df = pd.DataFrame(repeated_authors_list)
+     if not repeated_authors_df.empty:
+         repeated_authors_df = repeated_authors_df.sort_values(by="Count", ascending=False)
+
+     status_message = f"Found {len(top_k_original_indices)} results. {len(repeated_authors_df)} authors appear more than once."
+
+     return repeated_authors_df, results_markdown_string, status_message
+
+ # 5. Gradio Interface Creation (Modified for default k=20)
+ def create_gradio_app(processed_data, model):
+     """Creates and returns the Gradio interface."""
+     search_func_with_data = functools.partial(find_similar_articles, model=model, all_data=processed_data)
+
+     with gr.Blocks(theme=gr.themes.Soft()) as demo:
+         gr.Markdown("# Abstract-Based Article Similarity Finder")
+         gr.Markdown(f"Uses Sentence Embeddings (`{MODEL_NAME}`) to find similar articles based on abstracts.")
+         gr.Markdown(f"Data sourced from CSV files in the **`{SCOPUS_FOLDER}`** folder and stored/updated in `{DATA_FILE}`.")
+
+         with gr.Row():
+             with gr.Column(scale=2):
+                 input_abstract_box = gr.Textbox(
+                     lines=10,
+                     label="Paste Abstract Here",
+                     placeholder="Enter the abstract text you want to find similar articles for..."
+                 )
+                 with gr.Row():
+                     pos_terms_box = gr.Textbox(label="Positive Search Terms (Optional)", placeholder="Add terms to boost relevance (e.g., specific methods, concepts)")
+                     neg_terms_box = gr.Textbox(label="Negative Search Terms (Optional)", placeholder="Add terms to decrease relevance (e.g., unrelated topics)")
+
+                 # Set default k to 20, ensure max is sufficient
+                 max_k = max(20, min(30, len(processed_data) if processed_data else 20))  # Ensure max is at least 20
+                 default_k = max(1, min(20, len(processed_data) if processed_data else 1))  # Default to 20 if possible
+                 k_slider = gr.Slider(minimum=1, maximum=max_k, step=1, value=default_k, label="Number of Results (k)")
+                 submit_button = gr.Button("Find Similar Articles", variant="primary")
+
+             with gr.Column(scale=3):
+                 status_textbox = gr.Textbox(label="Status", interactive=False)
+                 # DataFrame for repeated authors (remains the same)
+                 repeated_authors_dataframe = gr.DataFrame(
+                     label="Authors Appearing Multiple Times in Results",
+                     headers=["Author Name", "Count"],
+                     visible=True
+                 )
+                 # Changed results display to Markdown
+                 results_markdown_display = gr.Markdown(
+                     label="Search Results"
+                 )
+
+         # Update outputs for the click event
+         submit_button.click(
+             fn=search_func_with_data,
+             inputs=[input_abstract_box, pos_terms_box, neg_terms_box, k_slider],
+             # Output order: repeated authors DF, results Markdown, status Textbox
+             outputs=[repeated_authors_dataframe, results_markdown_display, status_textbox]
+         )
+
+         # Update outputs for examples, setting k to 20
+         if processed_data and len(processed_data) > 0 and 'abstract' in processed_data[0]:
+             try:
+                 example_abstract_text = processed_data[0].get('abstract', '')
+                 if isinstance(example_abstract_text, str):
+                     example_abstract = example_abstract_text[:500]
+                     if len(example_abstract_text) > 500:
+                         example_abstract += "..."
+                     # Set example k to 20, ensuring it doesn't exceed max_k
+                     example_k = max(1, min(20, len(processed_data) if processed_data else 1))
+                     example_k = min(example_k, max_k)  # Ensure example k doesn't exceed slider max
+
+                     gr.Examples(
+                         # Use example_k for the example
+                         examples=[[example_abstract, "", "", example_k]],
+                         inputs=[input_abstract_box, pos_terms_box, neg_terms_box, k_slider],
+                         # Update outputs for examples as well
+                         outputs=[repeated_authors_dataframe, results_markdown_display, status_textbox],
+                         fn=search_func_with_data,
+                         cache_examples=False
+                     )
+                 else:
+                     print("Warning: Could not create example because the first abstract is not a string.")
+             except Exception as e:
+                 print(f"Could not create example: {e}")
+
+     return demo
+
+
+ # 6. Main Execution Block
+ if __name__ == "__main__":
+     # --- Create Scopus Folder if it doesn't exist (for testing) ---
+     if not os.path.exists(SCOPUS_FOLDER):
+         print(f"Creating folder '{SCOPUS_FOLDER}' as it does not exist.")
+         os.makedirs(SCOPUS_FOLDER)
+         if os.path.exists('scopus-24.csv'):  # Check for the specific file you uploaded
+             try:
+                 import shutil
+                 shutil.copy('scopus-24.csv', os.path.join(SCOPUS_FOLDER, 'scopus-24.csv'))
+                 print(f"Copied 'scopus-24.csv' into '{SCOPUS_FOLDER}' for testing.")
+             except Exception as e:
+                 print(f"Could not copy 'scopus-24.csv': {e}")
+         else:
+             print(f"Place your Scopus CSV files inside the '{SCOPUS_FOLDER}' directory.")
+     # --- End of folder creation ---
+
+
+     # Load the model once
+     sbert_model = load_model(MODEL_NAME)
+
+     # Preprocess data (load existing, check CSVs in folder for new)
+     print("--- Starting Data Preprocessing ---")
+     all_processed_data = preprocess_data(sbert_model)
+     print("--- Data Preprocessing Finished ---")
+     if not all_processed_data:
+         print(f"Warning: No data loaded or processed. Ensure CSV files exist in '{SCOPUS_FOLDER}' and are valid.")
+
+     # Create and launch the Gradio app
+     app = create_gradio_app(all_processed_data, sbert_model)
+     print("Launching Gradio app...")
+     app.launch()
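
Because the search pipeline in app.py is exposed as plain functions, it can also be exercised without launching the Gradio UI. A hedged sketch of the flow the "Find Similar Articles" button triggers, assuming app.py is importable from the working directory and abstract-and-embeddings.json is present; the example abstract is made up:

# Hedged sketch: run the same search the UI performs, without gr.Blocks.
from app import load_model, preprocess_data, find_similar_articles

model = load_model()           # cached SentenceTransformer (all-MiniLM-L6-v2)
data = preprocess_data(model)  # loads/updates abstract-and-embeddings.json

authors_df, results_md, status = find_similar_articles(
    input_abstract="How do social movements adopt and spread new protest tactics?",  # made-up query
    pos_terms="diffusion",     # optional terms added to the query embedding
    neg_terms="",              # optional terms subtracted from it
    k=10,
    model=model,
    all_data=data,
)
print(status)            # e.g. "Found 10 results. ... authors appear more than once."
print(results_md[:400])  # first part of the Markdown-formatted result list
print(authors_df)        # DataFrame of authors appearing more than once in the results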
pyproject.toml ADDED
@@ -0,0 +1,7 @@
+ [project]
+ name = "scopus-ab"
+ version = "0.1.0"
+ description = "Add your description here"
+ readme = "README.md"
+ requires-python = ">=3.10"
+ dependencies = []
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ gradio
+ sentence-transformers
+ pandas
+ numpy
+ scikit-learn
+ torch
+ torchvision
+ torchaudio
scopus/.DS_Store ADDED
Binary file (6.15 kB).
 
scopus/scopus-24.csv ADDED
The diff for this file is too large to render.
 
scopus/scopus-soc-journals.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a80d8e268a4a6a2a113f7d5f2e4be6192af0ec84f9e32edb25a371c32be2c615
+ size 31758963
uv.lock ADDED
@@ -0,0 +1,7 @@
+ version = 1
+ requires-python = ">=3.10"
+
+ [[package]]
+ name = "scopus-ab"
+ version = "0.1.0"
+ source = { virtual = "." }