Spaces:
Sleeping
Sleeping
Update tsadropboxretrieval.py
Browse files- tsadropboxretrieval.py +20 -10
tsadropboxretrieval.py
CHANGED
|
@@ -1,8 +1,6 @@
|
|
| 1 |
# -*- coding: utf-8 -*-
|
| 2 |
"""TSADropboxRetrieval.ipynb
|
| 3 |
-
|
| 4 |
Automatically generated by Colaboratory.
|
| 5 |
-
|
| 6 |
Original file is located at
|
| 7 |
https://colab.research.google.com/drive/1d-UI3Y-z7Dj-vqu69CxluOUnN4rvsUuE
|
| 8 |
"""
|
|
@@ -205,12 +203,14 @@ def GetParquetDF():
|
|
| 205 |
return df
|
| 206 |
|
| 207 |
|
| 208 |
-
def getPathtoPDF_File(nameofPDF):
|
| 209 |
parquetDf = load_parquet_df()
|
| 210 |
nameofPDF=nameofPDF.replace('"', '')
|
| 211 |
try:
|
| 212 |
# path=parquetDf.loc[parquetDf['name'] == nameofPDF, 'path_display'].iloc[0]
|
| 213 |
path = parquetDf.at[parquetDf.index[parquetDf['name'] == nameofPDF][0], 'path_display']
|
|
|
|
|
|
|
| 214 |
link=getSharedLink(path)
|
| 215 |
print(path,link)
|
| 216 |
except:
|
|
@@ -225,9 +225,9 @@ def getPDFData(path):
|
|
| 225 |
data = res.content
|
| 226 |
return data
|
| 227 |
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
|
| 232 |
projname = '/' + projname.split(' ')[0] # Extract main project name
|
| 233 |
projname = projname.replace('/"', '') # Remove unwanted characters
|
|
@@ -237,18 +237,28 @@ def retrieveProjects(projname):
|
|
| 237 |
documentsToMeasure = []
|
| 238 |
RelevantDocuments = []
|
| 239 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 240 |
# Store the original path before converting it to lowercase
|
| 241 |
parquetDf['original_path_display'] = parquetDf['path_display']
|
| 242 |
|
| 243 |
# Create a lowercase column for case-insensitive matching
|
| 244 |
parquetDf['path_display_lower'] = parquetDf['path_display'].str.lower()
|
| 245 |
-
|
|
|
|
| 246 |
# Filter using the lowercase column but retrieve the original paths
|
| 247 |
mask = parquetDf['path_display_lower'].apply(lambda x: '/'+projname in x and '01 project details' in x)
|
| 248 |
-
|
|
|
|
| 249 |
# Retrieve the original (case-sensitive) paths before lowering them
|
| 250 |
RelevantDocuments = parquetDf[mask][['name', 'original_path_display']].values.tolist()
|
| 251 |
documentsToMeasure = [doc for doc in RelevantDocuments if doc[0].endswith('.pdf')] # Keep only PDFs
|
|
|
|
|
|
|
|
|
|
|
|
|
| 252 |
|
| 253 |
# Extract path from the original (case-sensitive) column
|
| 254 |
if RelevantDocuments:
|
|
@@ -258,6 +268,6 @@ def retrieveProjects(projname):
|
|
| 258 |
|
| 259 |
# Remove temporary columns
|
| 260 |
parquetDf.drop(columns=['original_path_display', 'path_display_lower'], inplace=True)
|
| 261 |
-
|
|
|
|
| 262 |
return documentsToMeasure, RelevantDocuments, extracted_path
|
| 263 |
-
|
|
|
|
| 1 |
# -*- coding: utf-8 -*-
|
| 2 |
"""TSADropboxRetrieval.ipynb
|
|
|
|
| 3 |
Automatically generated by Colaboratory.
|
|
|
|
| 4 |
Original file is located at
|
| 5 |
https://colab.research.google.com/drive/1d-UI3Y-z7Dj-vqu69CxluOUnN4rvsUuE
|
| 6 |
"""
|
|
|
|
| 203 |
return df
|
| 204 |
|
| 205 |
|
| 206 |
+
def getPathtoPDF_File(nameofPDF,progress_callback=None):
|
| 207 |
parquetDf = load_parquet_df()
|
| 208 |
nameofPDF=nameofPDF.replace('"', '')
|
| 209 |
try:
|
| 210 |
# path=parquetDf.loc[parquetDf['name'] == nameofPDF, 'path_display'].iloc[0]
|
| 211 |
path = parquetDf.at[parquetDf.index[parquetDf['name'] == nameofPDF][0], 'path_display']
|
| 212 |
+
if progress_callback:
|
| 213 |
+
progress_callback(60)
|
| 214 |
link=getSharedLink(path)
|
| 215 |
print(path,link)
|
| 216 |
except:
|
|
|
|
| 225 |
data = res.content
|
| 226 |
return data
|
| 227 |
|
| 228 |
+
def retrieveProjects(projname, progress_callback=None):
|
| 229 |
+
# if progress_callback:
|
| 230 |
+
progress_callback(20)
|
| 231 |
|
| 232 |
projname = '/' + projname.split(' ')[0] # Extract main project name
|
| 233 |
projname = projname.replace('/"', '') # Remove unwanted characters
|
|
|
|
| 237 |
documentsToMeasure = []
|
| 238 |
RelevantDocuments = []
|
| 239 |
|
| 240 |
+
# Send progress update (40%)
|
| 241 |
+
if progress_callback:
|
| 242 |
+
progress_callback(40)
|
| 243 |
+
|
| 244 |
# Store the original path before converting it to lowercase
|
| 245 |
parquetDf['original_path_display'] = parquetDf['path_display']
|
| 246 |
|
| 247 |
# Create a lowercase column for case-insensitive matching
|
| 248 |
parquetDf['path_display_lower'] = parquetDf['path_display'].str.lower()
|
| 249 |
+
if progress_callback:
|
| 250 |
+
progress_callback(50)
|
| 251 |
# Filter using the lowercase column but retrieve the original paths
|
| 252 |
mask = parquetDf['path_display_lower'].apply(lambda x: '/'+projname in x and '01 project details' in x)
|
| 253 |
+
if progress_callback:
|
| 254 |
+
progress_callback(60)
|
| 255 |
# Retrieve the original (case-sensitive) paths before lowering them
|
| 256 |
RelevantDocuments = parquetDf[mask][['name', 'original_path_display']].values.tolist()
|
| 257 |
documentsToMeasure = [doc for doc in RelevantDocuments if doc[0].endswith('.pdf')] # Keep only PDFs
|
| 258 |
+
|
| 259 |
+
# Send progress update (70%)
|
| 260 |
+
if progress_callback:
|
| 261 |
+
progress_callback(70)
|
| 262 |
|
| 263 |
# Extract path from the original (case-sensitive) column
|
| 264 |
if RelevantDocuments:
|
|
|
|
| 268 |
|
| 269 |
# Remove temporary columns
|
| 270 |
parquetDf.drop(columns=['original_path_display', 'path_display_lower'], inplace=True)
|
| 271 |
+
if progress_callback:
|
| 272 |
+
progress_callback(80)
|
| 273 |
return documentsToMeasure, RelevantDocuments, extracted_path
|
|
|