Spaces:

tferhan
/

data_gov_ma

Sleeping

tferhan committed on Jun 6, 2024

Commit

3e4b286

verified ·

1 Parent(s): bf95cbd

Update document_scrapped.py

Files changed (1) hide show

document_scrapped.py CHANGED Viewed

@@ -1,26 +1,4 @@
-# -*- coding: utf-8 -*-
-"""document_scrapped.ipynb
-Automatically generated by Colab.
-Original file is located at
-    https://colab.research.google.com/drive/1cVGt7jq8uw5FYIwWOlUTAFbdVhPkU1FJ
-"""
 import re
def select_words_until_char_limit(s, char_limit):
    """Return the leading words of *s* whose joined length fits *char_limit*.

    Punctuation (any character that is neither a word character nor
    whitespace) is removed first. Whole words are then taken from the start
    of the text and joined with single spaces; selection stops at the first
    word that would make the result longer than ``char_limit`` characters.

    Args:
        s: Input text.
        char_limit: Maximum length of the returned string.

    Returns:
        The selected words joined by single spaces, or ``""`` when not even
        the first word fits.
    """
    words = re.sub(r'[^\w\s]', '', s).split()  # remove punctuation, but leave spaces
    selected_words = []
    total_chars = 0
    for word in words:
        # A separating space is only needed between words, so the first word
        # costs just its own length; charging it a space made the result
        # always at least one character shorter than the limit allows.
        needed = len(word) + (1 if selected_words else 0)
        if total_chars + needed > char_limit:
            break
        selected_words.append(word)
        total_chars += needed
    return ' '.join(selected_words)
 from bs4 import BeautifulSoup
 import requests
 import json
@@ -50,6 +28,22 @@ from unidecode import unidecode
 from langchain_huggingface import HuggingFaceEndpoint
 import os
 def downl(url):
   try:
     rq = requests.get(url)
@@ -183,10 +177,10 @@ def pptx(url : str) -> str:
         print(f"An error occurred: {e}")
         return 'No data avaible'
-def get_data(url : str) -> str:
-  jo = downl(url)
   ext = jo.split(".")[-1]
-  print(ext)
   if ext == 'xlsx' or ext == 'xls' or ext == 'xlsm':
     rs = excel(jo)
     return rs
@@ -202,4 +196,4 @@ def get_data(url : str) -> str:
   elif ext == 'pptx' or ext == 'ppt':
     rs = pptx(jo)
     return rs
-  return "No data returned"

 import re
 from bs4 import BeautifulSoup
 import requests
 import json
 from langchain_huggingface import HuggingFaceEndpoint
 import os
def select_words_until_char_limit(s, char_limit):
    """Return the leading words of *s* whose joined length fits *char_limit*.

    Punctuation (any character that is neither a word character nor
    whitespace) is removed first. Whole words are then taken from the start
    of the text and joined with single spaces; selection stops at the first
    word that would make the result longer than ``char_limit`` characters.

    Args:
        s: Input text.
        char_limit: Maximum length of the returned string.

    Returns:
        The selected words joined by single spaces, or ``""`` when not even
        the first word fits.
    """
    words = re.sub(r'[^\w\s]', '', s).split()  # remove punctuation, but leave spaces
    selected_words = []
    total_chars = 0
    for word in words:
        # A separating space is only needed between words, so the first word
        # costs just its own length; charging it a space made the result
        # always at least one character shorter than the limit allows.
        needed = len(word) + (1 if selected_words else 0)
        if total_chars + needed > char_limit:
            break
        selected_words.append(word)
        total_chars += needed
    return ' '.join(selected_words)
 def downl(url):
   try:
     rq = requests.get(url)
         print(f"An error occurred: {e}")
         return 'No data avaible'
+def get_data(url):
+  ki = url.replace('\nObservation', '').replace('"\nObservation', '')
+  jo = downl(ki)
   ext = jo.split(".")[-1]
   if ext == 'xlsx' or ext == 'xls' or ext == 'xlsm':
     rs = excel(jo)
     return rs
   elif ext == 'pptx' or ext == 'ppt':
     rs = pptx(jo)
     return rs
+  return "No data returned"