Spaces:
Sleeping
Sleeping
Commit
·
08a92ff
1
Parent(s):
1413056
Update app.py
Browse files
app.py
CHANGED
|
@@ -34,6 +34,24 @@ def finder(url, soup, media_type):
|
|
| 34 |
files.append(file_url)
|
| 35 |
return files
|
| 36 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
def scrapper(url):
|
| 38 |
try:
|
| 39 |
response = requests.get(url, timeout=10)
|
|
@@ -55,10 +73,7 @@ def scrapper(url):
|
|
| 55 |
full_text += line + ' '
|
| 56 |
|
| 57 |
# Initialize the summarization pipeline
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
# Summarize the content
|
| 61 |
-
summary = summarizer(full_text, max_length=200, min_length=50, do_sample=False)
|
| 62 |
|
| 63 |
# Extract the summary text
|
| 64 |
summary_text = summary[0]['summary_text']
|
|
|
|
| 34 |
files.append(file_url)
|
| 35 |
return files
|
| 36 |
|
| 37 |
+
def summarize_long_text(text, chunk_size=1024):
    """Summarize arbitrarily long text by splitting it into word chunks.

    The summarization model has a limited input length, so the text is
    split into windows of ``chunk_size`` words, each window is summarized
    independently, and the per-window summaries are joined into one string.

    Args:
        text: The text to summarize.
        chunk_size: Maximum number of whitespace-delimited words per chunk
            fed to the model. NOTE(review): the model limit is in *tokens*,
            not words — 1024 words may exceed the model's input window;
            confirm against the default summarization model.

    Returns:
        str: The combined chunk summaries, or an empty string when *text*
        contains no words.
    """
    # Tokenize the text into words.
    words = text.split()

    # Guard against empty input before paying for model construction:
    # the pipeline raises on an empty string, and there is nothing to do.
    if not words:
        return ''

    # Initialize the summarization pipeline (default summarization model).
    summarizer = pipeline('summarization')

    # Split the words into chunks of the specified size.
    chunks = [' '.join(words[i:i + chunk_size])
              for i in range(0, len(words), chunk_size)]

    # Summarize each chunk independently and keep only the summary text.
    summarized_chunks = [
        summarizer(chunk, max_length=1024, min_length=50,
                   do_sample=False)[0]['summary_text']
        for chunk in chunks
    ]

    # Combine the summarized chunks into the final summary.
    return ' '.join(summarized_chunks)
|
| 54 |
+
|
| 55 |
def scrapper(url):
|
| 56 |
try:
|
| 57 |
response = requests.get(url, timeout=10)
|
|
|
|
| 73 |
full_text += line + ' '
|
| 74 |
|
| 75 |
# Initialize the summarization pipeline
|
| 76 |
+
summary = summarize_long_text(full_text)
|
|
|
|
|
|
|
|
|
|
| 77 |
|
| 78 |
# Extract the summary text
|
| 79 |
summary_text = summary[0]['summary_text']
|