Spaces: Runtime error
import os

from transformers import pipeline, BartTokenizer
from pydantic import BaseModel

# Initialize the summarization pipeline
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")  # pass device=-1 to force CPU

# Initialize the tokenizer
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")

# Request schema (only needed if the summarizer is served over HTTP; the batch script below never uses it)
class TextRequest(BaseModel):
    text: str

# Paths
folder_path = "jfk_text"      # Folder containing the documents
output_dir = "summaryoutput"  # Folder to save summaries
os.makedirs(output_dir, exist_ok=True)

# Get a list of already summarized files
existing_summaries = set(os.listdir(output_dir))

# Function to split text into meaningful chunks (e.g., paragraphs or sentences)
def split_text_into_chunks(text, max_tokens=500):
    paragraphs = text.split("\n\n")  # Split by double newlines (paragraphs)
    chunks = []
    current_chunk = ""
    for paragraph in paragraphs:
        tokens = tokenizer.tokenize(paragraph)
        if len(tokens) > max_tokens:
            # Flush any accumulated text first so chunks stay in document order
            if current_chunk.strip():
                chunks.append(current_chunk.strip())
                current_chunk = ""
            # Over-long paragraph: fall back to splitting on word count
            words = paragraph.split()
            sub_chunks = [' '.join(words[i:i + max_tokens]) for i in range(0, len(words), max_tokens)]
            chunks.extend(sub_chunks)
        else:
            if len(tokenizer.tokenize(current_chunk + " " + paragraph)) > max_tokens:
                chunks.append(current_chunk.strip())
                current_chunk = paragraph
            else:
                current_chunk += " " + paragraph
    if current_chunk.strip():
        chunks.append(current_chunk.strip())
    return chunks
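# --- Optional alternative (a sketch; nothing below calls it) ---------------
# The word-count fallback above can still overflow BART's 1024-token limit,
# because a single word may expand to several sub-word tokens. A token-exact
# splitter could look like this; split_paragraph_by_tokens is a hypothetical
# helper introduced only for illustration.
def split_paragraph_by_tokens(paragraph, max_tokens=500):
    # Encode without special tokens, slice the id list, and decode each slice back to text
    ids = tokenizer.encode(paragraph, add_special_tokens=False)
    return [
        tokenizer.decode(ids[i:i + max_tokens], skip_special_tokens=True)
        for i in range(0, len(ids), max_tokens)
    ]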
# Process each document separately
def summarize_all_files():
    for filename in os.listdir(folder_path):
        if filename.endswith(".md"):  # Process Markdown files
            summary_filename = f"summary_{filename}"
            summary_filepath = os.path.join(output_dir, summary_filename)

            if summary_filename in existing_summaries:
                print(f"Skipping {filename}, already summarized.")
                continue  # Skip this file

            file_path = os.path.join(folder_path, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                text = file.read()

            # Split text into chunks
            chunks = split_text_into_chunks(text)

            # Summarize each chunk
            summaries = []
            for chunk in chunks:
                try:
                    summary = summarizer(
                        chunk,
                        max_length=min(130, len(tokenizer.tokenize(chunk))),  # Ensure max_length is not more than input tokens
                        min_length=min(30, len(tokenizer.tokenize(chunk)) // 2),  # Ensure min_length is reasonable
                        do_sample=False
                    )
                    summaries.append(summary[0]['summary_text'])
                except Exception as e:
                    print(f"Error summarizing chunk in {filename}: {e}")

            # Save individual document summary
            summary_text = "\n\n".join(summaries)
            with open(summary_filepath, "w", encoding="utf-8") as f:
                f.write(summary_text)
            print(f"Summary saved for {filename} -> {summary_filepath}")

    print("All files summarized successfully!")

if __name__ == "__main__":
    summarize_all_files()
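The TextRequest model is declared but never used, which suggests the summarizer was also meant to be exposed over HTTP; a batch script with no long-running app can also leave a Space showing "Runtime error", since Spaces typically expect a web app to stay up. Under that assumption, a minimal FastAPI sketch could look like the following; the app object, the /summarize route, and the response shape are illustrative and not taken from the original Space:

from fastapi import FastAPI

app = FastAPI()

@app.post("/summarize")
def summarize_text(request: TextRequest):
    # Reuse the same pipeline and generation settings as the batch script above
    result = summarizer(request.text, max_length=130, min_length=30, do_sample=False)
    return {"summary": result[0]["summary_text"]}

Served with, for example, uvicorn app:app (module name assumed), this would accept a JSON body matching TextRequest and return a single summary string.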