Spaces:
Sleeping
Sleeping
Update document_scrapped.py
Browse files- document_scrapped.py +20 -26
document_scrapped.py
CHANGED
|
@@ -1,26 +1,4 @@
|
|
| 1 |
-
# -*- coding: utf-8 -*-
|
| 2 |
-
"""document_scrapped.ipynb
|
| 3 |
-
|
| 4 |
-
Automatically generated by Colab.
|
| 5 |
-
|
| 6 |
-
Original file is located at
|
| 7 |
-
https://colab.research.google.com/drive/1cVGt7jq8uw5FYIwWOlUTAFbdVhPkU1FJ
|
| 8 |
-
"""
|
| 9 |
-
|
| 10 |
import re
|
| 11 |
-
def select_words_until_char_limit(s, char_limit):
    """Return the leading words of *s* whose joined length fits *char_limit*.

    Punctuation (any character that is neither a word character nor
    whitespace) is stripped first. The remaining whitespace-separated words
    are then taken in order until adding the next one would make the
    space-joined result longer than ``char_limit`` characters.

    Args:
        s: Input text.
        char_limit: Maximum length of the returned string.

    Returns:
        The selected words joined by single spaces, or an empty string when
        not even the first word fits.
    """
    s_no_punct = re.sub(r'[^\w\s]', '', s)  # remove punctuation, but leave spaces
    selected_words = []
    total_chars = 0
    for word in s_no_punct.split():
        # A separating space is only emitted between words, so the first
        # word costs len(word) and each later word costs len(word) + 1.
        # Charging the extra 1 on every word would reject results that fit
        # the limit exactly.
        cost = len(word) + (1 if selected_words else 0)
        if total_chars + cost > char_limit:
            break
        selected_words.append(word)
        total_chars += cost
    return ' '.join(selected_words)
|
| 23 |
-
|
| 24 |
from bs4 import BeautifulSoup
|
| 25 |
import requests
|
| 26 |
import json
|
|
@@ -50,6 +28,22 @@ from unidecode import unidecode
|
|
| 50 |
from langchain_huggingface import HuggingFaceEndpoint
|
| 51 |
import os
|
| 52 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
def downl(url):
|
| 54 |
try:
|
| 55 |
rq = requests.get(url)
|
|
@@ -183,10 +177,10 @@ def pptx(url : str) -> str:
|
|
| 183 |
print(f"An error occurred: {e}")
|
| 184 |
return 'No data avaible'
|
| 185 |
|
| 186 |
-
def get_data(url
|
| 187 |
-
|
|
|
|
| 188 |
ext = jo.split(".")[-1]
|
| 189 |
-
print(ext)
|
| 190 |
if ext == 'xlsx' or ext == 'xls' or ext == 'xlsm':
|
| 191 |
rs = excel(jo)
|
| 192 |
return rs
|
|
@@ -202,4 +196,4 @@ def get_data(url : str) -> str:
|
|
| 202 |
elif ext == 'pptx' or ext == 'ppt':
|
| 203 |
rs = pptx(jo)
|
| 204 |
return rs
|
| 205 |
-
return "No data returned"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import re
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
from bs4 import BeautifulSoup
|
| 3 |
import requests
|
| 4 |
import json
|
|
|
|
| 28 |
from langchain_huggingface import HuggingFaceEndpoint
|
| 29 |
import os
|
| 30 |
|
| 31 |
+
|
| 32 |
+
def select_words_until_char_limit(s, char_limit):
    """Return the leading words of *s* whose joined length fits *char_limit*.

    Punctuation (any character that is neither a word character nor
    whitespace) is stripped first. The remaining whitespace-separated words
    are then taken in order until adding the next one would make the
    space-joined result longer than ``char_limit`` characters.

    Args:
        s: Input text.
        char_limit: Maximum length of the returned string.

    Returns:
        The selected words joined by single spaces, or an empty string when
        not even the first word fits.
    """
    s_no_punct = re.sub(r'[^\w\s]', '', s)  # remove punctuation, but leave spaces
    selected_words = []
    total_chars = 0
    for word in s_no_punct.split():
        # A separating space is only emitted between words, so the first
        # word costs len(word) and each later word costs len(word) + 1.
        # Charging the extra 1 on every word would reject results that fit
        # the limit exactly.
        cost = len(word) + (1 if selected_words else 0)
        if total_chars + cost > char_limit:
            break
        selected_words.append(word)
        total_chars += cost
    return ' '.join(selected_words)
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
|
| 47 |
def downl(url):
|
| 48 |
try:
|
| 49 |
rq = requests.get(url)
|
|
|
|
| 177 |
print(f"An error occurred: {e}")
|
| 178 |
return 'No data avaible'
|
| 179 |
|
| 180 |
+
def get_data(url):
|
| 181 |
+
ki = url.replace('\nObservation', '').replace('"\nObservation', '')
|
| 182 |
+
jo = downl(ki)
|
| 183 |
ext = jo.split(".")[-1]
|
|
|
|
| 184 |
if ext == 'xlsx' or ext == 'xls' or ext == 'xlsm':
|
| 185 |
rs = excel(jo)
|
| 186 |
return rs
|
|
|
|
| 196 |
elif ext == 'pptx' or ext == 'ppt':
|
| 197 |
rs = pptx(jo)
|
| 198 |
return rs
|
| 199 |
+
return "No data returned"
|