llm-summarizer / src /summarize.py
kshitij230's picture
Upload 4 files
0fd36e0 verified
import fitz # PyMuPDF
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from concurrent.futures import ThreadPoolExecutor
# Model setup: runs once at import time. The tokenizer and the causal-LM
# weights are loaded from the same Hugging Face TinyLlama chat checkpoint,
# and the model is moved to the GPU when CUDA is available, otherwise CPU.
_MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
tokenizer = AutoTokenizer.from_pretrained(_MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(_MODEL_ID)
model.to(device)
def generate_summary(prompt, max_new_tokens=256):
    """Generate a sampled completion for *prompt* with the module-level model.

    Args:
        prompt: Instruction plus source text fed to the model. The tokenizer
            is called with ``truncation=True``.
        max_new_tokens: Upper bound on the number of tokens to generate.

    Returns:
        The newly generated text only (the prompt is not echoed back),
        decoded with special tokens skipped.
    """
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=0.7,
    )
    # A causal LM returns the prompt tokens followed by the continuation.
    # Decoding the whole sequence would hand callers the prompt (and the
    # full source text) again, so keep only the tokens after the prompt.
    prompt_len = inputs["input_ids"].shape[-1]
    new_tokens = outputs[0][prompt_len:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)
def load_pdf_chunks(file, min_len=100):
    """Extract paragraph-sized text chunks from a PDF file object.

    Args:
        file: Binary file-like object; the bytes returned by ``file.read()``
            are parsed as a PDF.
        min_len: Paragraphs whose stripped length is not greater than this
            value are discarded.

    Returns:
        A list of stripped paragraph strings, in page order.
    """
    chunks = []
    # The context manager closes the document even if text extraction raises,
    # so the parsed PDF is not kept open after the function returns.
    with fitz.open(stream=file.read(), filetype="pdf") as doc:
        for page in doc:
            # Paragraphs are taken to be runs of text separated by a blank line.
            for para in page.get_text().split("\n\n"):
                para = para.strip()
                if len(para) > min_len:
                    chunks.append(para)
    return chunks
def chunk_sections(chunks, size=5):
    """Split *chunks* into consecutive slices of at most *size* items.

    The last slice holds whatever is left over and may be shorter.
    Returns a list of slices; an empty input yields an empty list.
    """
    sections = []
    for start in range(0, len(chunks), size):
        stop = start + size
        sections.append(chunks[start:stop])
    return sections
def summarize_chunks(chunks):
    """Summarize text chunks hierarchically into one final summary.

    Three passes are made: each chunk is summarized on its own, the chunk
    summaries are combined in groups of five, and the group summaries are
    merged into a final abstract.

    Args:
        chunks: Sequence of text passages to summarize.

    Returns:
        The final summary string, or ``""`` when *chunks* is empty.
    """
    # With nothing to summarize, the final prompt would contain no source
    # text and the model would invent a summary; return an empty result.
    if not chunks:
        return ""

    def _summarize_one(chunk):
        return generate_summary(f"Summarize:\n{chunk}")

    # First-level chunk summaries.
    # NOTE(review): all workers call the single module-level model; confirm
    # that concurrent generation on one model is safe and actually faster.
    with ThreadPoolExecutor(max_workers=4) as executor:
        summaries = list(executor.map(_summarize_one, chunks))

    # Mid-level summaries per section of five chunk summaries.
    section_chunks = chunk_sections(summaries, 5)
    section_summaries = [
        generate_summary("Combine these summaries:\n" + "\n".join(sec))
        for sec in section_chunks
    ]

    # Final abstract over all section summaries.
    final_summary = generate_summary(
        "Write a final summary:\n" + "\n".join(section_summaries),
        max_new_tokens=300,
    )
    return final_summary