Spaces:

bisryy
/

ContentWeaverAI

Sleeping

File size: 13,308 Bytes

2d7fa45
 
 
 
 
 
0f7b2e5

import os
# Redirect the Hugging Face cache locations to /tmp/.cache. These assignments
# are deliberately placed before the transformers / sentence_transformers
# imports below.
# NOTE(review): presumably the libraries read these variables when they are
# first imported, so moving the assignments after the imports may have no
# effect — confirm before reordering.
os.environ['TRANSFORMERS_CACHE'] = '/tmp/.cache'
os.environ['HF_HOME'] = '/tmp/.cache'
os.environ['HF_DATASETS_CACHE'] = '/tmp/.cache'
os.environ['HF_METRICS_CACHE'] = '/tmp/.cache'

import feedparser
import requests
from bs4 import BeautifulSoup
import chromadb
from sentence_transformers import SentenceTransformer
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import torch
import time
import json
import uuid
from dotenv import load_dotenv

load_dotenv()

# --- GLOBAL SETUP ---
# Everything in this section runs once, at import time. The objects created
# here (embedding_model, client, collection_name, tokenizer, model,
# llm_pipeline) are module-level globals read by run_newsletter_workflow().

# Load embedding model once; it is used both to embed article text and to
# embed the user's keyword query.
print('Loading embedding model...')
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
print('Embedding model loaded.')

# Initialize Chroma client once (default settings). The collection itself is
# deleted and re-created by name on every workflow run.
client = chromadb.Client()
collection_name = "newsletter_articles"

# Load LLM once
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

print(f"Loading LLM: {model_id}")

# Authenticate against the Hugging Face Hub when a token is available in the
# `HF_Token` environment variable (load_dotenv() above may have populated it
# from a .env file). Without a token the script only prints a notice and
# continues; the from_pretrained() calls below then run unauthenticated.
from huggingface_hub import login
hf_token = os.getenv('HF_Token')
if hf_token:
    login(token=hf_token)
else:
    print("HF_Token not found in environment. Check your .env file.")

tokenizer = AutoTokenizer.from_pretrained(model_id)
# The generation calls later pass pad_token_id=tokenizer.pad_token_id, so make
# sure a pad token exists by falling back to the EOS token.
if tokenizer.pad_token is None:
    print("Warning: pad_token is None. Setting pad_token to eos_token.")
    tokenizer.pad_token = tokenizer.eos_token

# Load the causal LM weights in bfloat16 with automatic device placement.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16
)
print("LLM loaded.")

# Wrap the already-loaded model and tokenizer in a text-generation pipeline.
# NOTE(review): torch_dtype and device_map are passed again here although the
# model object is already instantiated above — likely redundant; confirm.
llm_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)


# --- MAIN FUNCTION ---

# RSS/Atom feeds polled on every workflow run.
_RSS_FEED_URLS = (
    "http://feeds.feedburner.com/TechCrunch/artificial-intelligence",
    "https://news.mit.edu/topic/mitcobrand-artificial-intelligence2-rss.xml",
    "https://hackingbutlegal.com/feed/",
)

# Upper bound (in characters) on the text kept from a scraped page.
_MAX_SCRAPED_CHARS = 5000

# Upper bound (in tokens) on the article text placed into a summary prompt.
_MAX_SUMMARY_INPUT_TOKENS = 3000


def _normalize_keywords(raw_keywords):
    """Return the keywords as a list of stripped, non-blank strings.

    Accepts ``None``, a single string, or an iterable of values. ``None``
    items and items that are empty after stripping are dropped.
    """
    if raw_keywords is None:
        return []
    if isinstance(raw_keywords, str):
        # A bare string would otherwise be iterated character by character.
        raw_keywords = [raw_keywords]
    stripped = (
        str(keyword).strip()
        for keyword in raw_keywords
        if keyword is not None
    )
    return [keyword for keyword in stripped if keyword]


def _clean_text(text):
    """Collapse every run of whitespace in *text* into a single space."""
    return ' '.join(text.split())


def _fetch_articles_from_feeds(feed_urls):
    """Parse each feed URL and return a list of article dicts.

    Each dict has the keys ``id``, ``title``, ``link``, ``published``,
    ``summary`` and ``content``. Entries without a link are skipped because
    they can be neither scraped nor cited. A failure while handling one feed
    is printed and does not stop the remaining feeds.
    """
    articles = []
    for url in feed_urls:
        try:
            feed = feedparser.parse(url)
            for entry in feed.entries:
                link = entry.get("link")
                if not link:
                    continue
                summary = entry.get("summary", "")
                # `content` may be absent or an empty list; fall back to the
                # summary in both cases instead of raising IndexError.
                content_items = entry.get("content") or [{"value": summary}]
                articles.append({
                    "id": str(uuid.uuid4()),
                    "title": entry.get("title") or "Untitled",
                    "link": link,
                    "published": entry.get("published", "N/A"),
                    "summary": summary,
                    "content": content_items[0].get("value", summary),
                })
            print(f"Fetched {len(feed.entries)} entries from {url}")
            time.sleep(1)  # be polite between feed requests
        except Exception as e:
            # Best effort: one broken feed must not abort the whole run.
            print(f"Error fetching feed {url}: {e}")
    return articles


def _scrape_article_content(url):
    """Download *url* and return up to 5000 characters of its main text.

    Prefers an ``<article>``, ``<main>`` or ``<div class="content">`` element
    and falls back to joining all ``<p>`` elements. Returns ``None`` when the
    request or the parsing fails.
    """
    try:
        headers = {'User-Agent': 'MyNewsletterBot/1.0 (+http://example.com/botinfo)'}
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')
        main_content = (
            soup.find('article')
            or soup.find('main')
            or soup.find('div', class_='content')
        )
        if main_content:
            text = ' '.join(main_content.stripped_strings)
        else:
            text = ' '.join(p.get_text() for p in soup.find_all('p'))
        return text[:_MAX_SCRAPED_CHARS]
    except requests.exceptions.RequestException as e:
        print(f"Scraping error for {url}: {e}")
        return None
    except Exception as e:
        print(f'Scraping Parsing error for {url}: {e}')
        return None


def _index_articles(articles):
    """Re-create the Chroma collection, add *articles*, and return it.

    Articles whose content is empty after whitespace cleanup are skipped.
    """
    try:
        client.delete_collection(name=collection_name)
        print(f'Deleted existing collection: {collection_name}')
    except Exception as e:
        # Best effort: deletion fails when the collection does not exist yet.
        print(f"No existing collection removed: {e}")
    collection = client.create_collection(name=collection_name)
    print(f"Created collection: {collection_name}")

    print("Adding articles to Vector DB...")
    ids_to_add = []
    documents_to_add = []
    metadata_to_add = []
    for article in articles:
        cleaned_content = _clean_text(article['content'] or "")
        if not cleaned_content:
            continue
        ids_to_add.append(article['id'])
        documents_to_add.append(cleaned_content)
        metadata_to_add.append({
            "title": article['title'],
            "link": article['link'],
            "published": article['published'],
        })

    if not ids_to_add:
        print("No valid articles found to add to the collection.")
        return collection

    # Encode all documents in one batch rather than one call per article.
    embeddings_to_add = embedding_model.encode(
        documents_to_add, convert_to_tensor=False
    ).tolist()
    collection.add(
        ids=ids_to_add,
        embeddings=embeddings_to_add,
        documents=documents_to_add,
        metadatas=metadata_to_add,
    )
    print(f"Added {len(ids_to_add)} articles to the collection.")
    return collection


def _retrieve_relevant_articles(collection, query_keywords, top_n=5):
    """Query *collection* for the articles closest to the joined keywords.

    Returns the Chroma query result, or an empty dict when the collection
    holds no documents.
    """
    available = collection.count()
    if available == 0:
        print("Collection is empty. Cannot retrieve.")
        return {}

    query_text = " ".join(query_keywords)
    query_embedding = embedding_model.encode(
        query_text, convert_to_tensor=False
    ).tolist()

    print(f"\nQuerying for articles related to: '{query_text}'")
    results = collection.query(
        query_embeddings=[query_embedding],
        # Never request more results than there are stored documents.
        n_results=min(top_n, available),
        include=['metadatas', 'documents'],
    )
    print(f'Retrieved {len(results["ids"][0])} articles.')
    return results


def _generate_text(messages, fallback_prompt, max_new_tokens, temperature,
                   error_log_prefix, error_result):
    """Run the shared LLM pipeline on a chat prompt and return the new text.

    The prompt is built with the tokenizer's chat template; if that fails,
    *fallback_prompt* is used instead. Any failure during generation is
    printed with *error_log_prefix* and *error_result* is returned.
    """
    try:
        prompt = tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
    except Exception:
        prompt = fallback_prompt

    try:
        sequences = llm_pipeline(
            prompt,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=temperature,
            top_k=50,
            top_p=0.95,
            num_return_sequences=1,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
            # Ask for the completion only, so the prompt never has to be
            # stripped from the output by string matching.
            return_full_text=False,
        )
        return sequences[0]['generated_text'].strip()
    except Exception as e:
        print(f"{error_log_prefix}: {e}")
        return error_result


def _generate_summary(article_content, max_length=150):
    """Return an LLM-written summary of *article_content*."""
    # Truncate on token boundaries. Special tokens are neither added nor
    # decoded, so no tokenizer markers leak into the prompt text.
    token_ids = tokenizer.encode(
        article_content,
        max_length=_MAX_SUMMARY_INPUT_TOKENS,
        truncation=True,
        add_special_tokens=False,
    )
    truncated_content = tokenizer.decode(token_ids, skip_special_tokens=True)

    system_text = "You are a helpful assistant that summarizes articles concisely."
    user_text = f"Please summarize the following article:\n\n{truncated_content}\n\nSummary:"
    messages = [
        {"role": "system", "content": system_text},
        {"role": "user", "content": user_text},
    ]
    fallback_prompt = f"System: {system_text}\nUser: {user_text}\nAssistant:"

    print(f"\nGenerating summary...")
    summary = _generate_text(
        messages,
        fallback_prompt,
        max_new_tokens=max_length + 50,
        temperature=0.7,
        error_log_prefix="Error processing LLM output",
        error_result="Error generating summary.",
    )
    print("Summary generated.")
    return summary


def _generate_commentary(summary, title, user_tone, max_length=75):
    """Return a 1-2 sentence LLM commentary on *summary* in *user_tone*."""
    system_text = (
        "You are a content curator writing brief, engaging commentary for a "
        f"newsletter. Adopt a {user_tone} tone."
    )
    user_text = (
        "Write a short comment (1-2 sentences) about the following article "
        f"summary titled '{title}'. Relate it briefly to general interests in "
        "AI if possible, but focus on being engaging."
        f"\n\nSummary: {summary}\n\nCommentary:"
    )
    messages = [
        {"role": "system", "content": system_text},
        {"role": "user", "content": user_text},
    ]
    fallback_prompt = f"System: {system_text}\nUser: {user_text}\nAssistant:"

    print(f'Generating commentary for: {title}')
    commentary = _generate_text(
        messages,
        fallback_prompt,
        max_new_tokens=max_length + 30,
        temperature=0.8,
        error_log_prefix="Error processing LLM output for commentary",
        error_result="Error generating commentary",
    )
    print('Commentary generated.')
    return commentary


def _format_newsletter(retrieved_data, summaries_dict, commentaries_dict):
    """Render the retrieved articles as a Markdown newsletter string."""
    parts = [
        "# Your AI Agent & Workflow Digest 📰\n\n",
        "Here are some articles curated based on your interests:\n\n",
    ]

    if not retrieved_data or not retrieved_data.get('ids') or not retrieved_data['ids'][0]:
        parts.append("No relevant articles found this time.")
        return "".join(parts)

    for i, article_id in enumerate(retrieved_data['ids'][0]):
        metadata = retrieved_data['metadatas'][0][i]
        summary = summaries_dict.get(article_id, "Summary not available.")
        commentary = commentaries_dict.get(article_id, "")

        parts.append(f"## {metadata['title']}\n\n")
        parts.append(f"**Source:** [{metadata['link']}]({metadata['link']})\n")
        parts.append(f"**Published:** {metadata['published']}\n\n")
        parts.append(f"**Summary:** {summary}\n\n")
        if commentary:
            parts.append(f"**Quick Take:** {commentary}\n\n")
        parts.append("---\n\n")
    return "".join(parts)


def run_newsletter_workflow(prferences_dict):
    """Build a personalised Markdown newsletter from the configured feeds.

    Steps: fetch the feeds, scrape each article page, index the texts in the
    vector store, retrieve the three articles closest to the user's keywords,
    then summarise and comment on each with the LLM.

    Args:
        prferences_dict: Mapping with an optional ``"keywords"`` entry (a
            list of strings or a single string) and an optional
            ``"preferred_tone"`` entry (defaults to ``'informative'``). The
            misspelled parameter name is kept so that existing keyword
            callers continue to work.

    Returns:
        A ``(newsletter, status)`` tuple. ``newsletter`` is the Markdown text,
        or ``None`` with status ``"No Keywords provided"`` when no usable
        keyword was supplied.
    """
    preferences = prferences_dict or {}
    keywords = _normalize_keywords(preferences.get("keywords"))
    preferred_tone = preferences.get("preferred_tone") or 'informative'

    # Reject empty input before any network or model work is done.
    if not keywords:
        return None, "No Keywords provided"

    fetched_articles = _fetch_articles_from_feeds(_RSS_FEED_URLS)
    print(f"\nFetched a total of {len(fetched_articles)} articles.")

    # Replace the feed-provided content with the scraped page text whenever
    # scraping succeeds; otherwise the feed content is kept.
    for article in fetched_articles:
        print(f"Attempting to scrape: {article['link']}")
        full_content = _scrape_article_content(article['link'])
        if full_content:
            article['content'] = full_content
        time.sleep(2)  # throttle requests to the article hosts

    collection = _index_articles(fetched_articles)

    relevent_articles_data = _retrieve_relevant_articles(collection, keywords, top_n=3)
    print("\nRelevent data sample:")
    print(json.dumps(relevent_articles_data, indent=2))

    summaries = {}
    commentaries = {}
    if relevent_articles_data and relevent_articles_data.get('ids'):
        documents = relevent_articles_data['documents'][0]
        metadatas = relevent_articles_data['metadatas'][0]
        for i, article_id in enumerate(relevent_articles_data['ids'][0]):
            title = metadatas[i]['title']
            print(f"\nProcessing article: {title}")
            # Generation runs on the in-process pipeline, so no delay is
            # inserted between calls.
            summaries[article_id] = _generate_summary(documents[i])
            commentaries[article_id] = _generate_commentary(
                summaries[article_id], title, preferred_tone
            )
    else:
        print("No relevent articles retrieved to summarize.")

    final_newsletter = _format_newsletter(relevent_articles_data, summaries, commentaries)

    print("\n\n--- GENERATED NEWSLETTER ---")
    print(final_newsletter)
    print("--- END OF NEWSLETTER ---")

    return final_newsletter, "Newsletter generation successful."