File size: 10,490 Bytes
cf10c85
 
9a08a0f
54046f4
 
cf10c85
38db139
739c95c
cf10c85
 
07ceb61
cf10c85
1662274
cf10c85
54046f4
cf10c85
 
739c95c
dbdbe8c
cf10c85
3bdc160
739c95c
bab9790
1662274
54046f4
 
 
 
 
 
 
 
 
 
 
4fb18c4
cf10c85
9271377
8ce83ce
 
9271377
8ce83ce
cf10c85
 
 
1662274
cf10c85
54046f4
cf10c85
d3c98a4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cf10c85
 
a7c55d0
 
41abbcb
 
 
 
 
 
 
 
 
 
 
 
cf10c85
9271377
41abbcb
 
d3c98a4
 
 
1662274
d3c98a4
 
41abbcb
 
 
54046f4
 
41abbcb
a7c55d0
 
 
 
 
54046f4
 
41abbcb
54046f4
41abbcb
 
54046f4
41abbcb
 
 
 
54046f4
cf10c85
41abbcb
 
 
54046f4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1662274
 
54046f4
d3c98a4
 
41abbcb
d3c98a4
54046f4
a7c55d0
cf10c85
 
4f6bd49
46fd3e1
 
 
9a08a0f
522b0df
38db139
8c48251
522b0df
8c48251
54046f4
 
8c48251
 
1662274
cf10c85
1662274
54046f4
1662274
4f6bd49
 
1662274
 
4f6bd49
54046f4
 
4f6bd49
cf10c85
54046f4
 
7e1ea76
54046f4
 
1662274
54046f4
7e1ea76
54046f4
 
 
739c95c
 
54046f4
4f6bd49
 
 
1662274
 
4f6bd49
 
 
 
 
54046f4
cf10c85
7072ceb
1662274
 
 
 
4f6bd49
 
 
 
1662274
 
4f6bd49
 
 
1662274
 
 
 
 
 
 
 
 
 
 
 
 
 
54046f4
 
 
 
 
 
 
1662274
54046f4
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
import os
import feedparser
from chromadb import PersistentClient
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_core.documents import Document
import logging
from huggingface_hub import HfApi, login, snapshot_download
from datetime import datetime
import dateutil.parser
import hashlib
import json
import re
import requests # Ensure requests is imported

# Configure logging once at import time: INFO level with timestamped records.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Local directory holding the persistent ChromaDB store.
LOCAL_DB_DIR = "chroma_db"
# JSON file mapping each category name to a list of feed objects with a "url" key.
FEEDS_FILE = "rss_feeds.json"
# Name of the ChromaDB collection that stores the article documents.
COLLECTION_NAME = "news_articles"
# Hugging Face access token, read from the HF_TOKEN environment variable (None if unset).
HF_API_TOKEN = os.getenv("HF_TOKEN")
# Hugging Face dataset repository the DB and raw feeds are synced with.
REPO_ID = "broadfield-dev/news-rag-db"
# Upper bound on the number of entries taken from any single feed.
MAX_ARTICLES_PER_FEED = 1000
RAW_FEEDS_DIR = "raw_rss_feeds" # Directory for raw RSS files

def initialize_hf_api():
    """Log in to the Hugging Face Hub and return an ``HfApi`` client.

    Raises:
        ValueError: If the HF_TOKEN environment variable was not set.
        Exception: Whatever ``login`` or ``HfApi`` raised, after logging it.
    """
    if HF_API_TOKEN:
        try:
            login(token=HF_API_TOKEN)
            api_client = HfApi()
        except Exception as exc:
            logger.error(f"Failed to login to Hugging Face Hub: {exc}")
            raise
        return api_client

    # No token available: report it and refuse to continue.
    logger.error("Hugging Face API token (HF_TOKEN) not set.")
    raise ValueError("HF_TOKEN environment variable is not set.")
# Shared Hub client, created at import time: importing this module raises if
# HF_TOKEN is unset or the login fails.
hf_api = initialize_hf_api()

def get_embedding_model():
    """Return the shared sentence-embedding model, building it on first use.

    The instance is cached as an attribute on this function so the model is
    constructed at most once per process.
    """
    if hasattr(get_embedding_model, "model"):
        return get_embedding_model.model

    embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    get_embedding_model.model = embedder
    return get_embedding_model.model

def clean_text(text):
    """Strip HTML tags from *text* and collapse all whitespace runs.

    Args:
        text: Raw text, possibly containing HTML markup.

    Returns:
        The text with every ``<...>`` tag removed, whitespace runs collapsed
        to single spaces, and no leading or trailing whitespace. Returns ""
        when *text* is empty or not a string.
    """
    if not text or not isinstance(text, str):
        return ""
    # The previous pattern r'<|.*?>' was an alternation ("<" OR "anything up
    # to the next >"), so it also deleted the visible text preceding each
    # ">". Match only complete tags; [^>]* also covers tags spanning lines.
    without_tags = re.sub(r'<[^>]*>', '', text)
    # split()/join collapses every whitespace run and trims both ends.
    return ' '.join(without_tags.split())

def save_raw_rss_to_file(feed_url, content):
    """Write one feed's raw body to RAW_FEEDS_DIR, in a file named after its URL.

    Failures while writing are logged and swallowed, so the caller never sees
    an exception from the file write itself.
    """
    directory_missing = not os.path.exists(RAW_FEEDS_DIR)
    if directory_missing:
        os.makedirs(RAW_FEEDS_DIR)

    # Replace every non-alphanumeric character with "_" to derive the file name.
    safe_stem = re.sub(r'[^a-zA-Z0-9]', '_', feed_url)
    target_path = os.path.join(RAW_FEEDS_DIR, safe_stem + ".xml")

    try:
        with open(target_path, 'w', encoding='utf-8') as out_file:
            out_file.write(content)
        logger.info(f"Saved raw RSS from {feed_url} to {target_path}")
    except Exception as exc:
        logger.error(f"Could not save raw RSS from {feed_url}: {exc}")

def _extract_published(entry):
    """Return the entry's first parseable date as an ISO-8601 string, else "Unknown Date"."""
    for date_field in ("published", "updated", "created", "pubDate"):
        if date_field not in entry:
            continue
        try:
            return dateutil.parser.parse(entry[date_field]).isoformat()
        except (ValueError, TypeError, OverflowError):
            # OverflowError covers out-of-range date components; without it a
            # single bad date would abort processing of the whole feed.
            continue
    return "Unknown Date"


# Image URL extractors, tried in order; each takes an entry and returns a URL
# or None. Defined once here instead of being rebuilt for every entry.
_IMAGE_SOURCES = (
    lambda e: e.get("media_content", [{}])[0].get("url") if e.get("media_content") else None,
    lambda e: e.get("media_thumbnail", [{}])[0].get("url") if e.get("media_thumbnail") else None,
    lambda e: e.get("enclosure", {}).get("url") if e.get("enclosure") and e.get("enclosure", {}).get('type', '').startswith('image') else None,
    lambda e: next((lnk.get("href") for lnk in e.get("links", []) if lnk.get("type", "").startswith("image")), None),
)


def _extract_image(entry):
    """Return the first non-blank image URL found on the entry, else the "svg" placeholder."""
    for source_func in _IMAGE_SOURCES:
        try:
            img_url = source_func(entry)
        except (IndexError, AttributeError, TypeError):
            # Unexpected shape for this field; try the next source.
            continue
        if img_url and isinstance(img_url, str) and img_url.strip():
            return img_url
    return "svg"


def fetch_rss_feeds(timeout=30):
    """Download every feed listed in FEEDS_FILE and return its usable articles.

    Each feed's raw body is also saved via ``save_raw_rss_to_file``. Entries
    are skipped when they have no link, repeat a link already seen in this
    run, or have an empty description after cleaning. A failure on one feed
    is logged and does not stop the remaining feeds.

    Args:
        timeout: Seconds passed to ``requests.get`` as its timeout, so a
            stalled server cannot hang the run indefinitely.

    Returns:
        A list of dicts with the keys "title", "link", "description",
        "published", "category" and "image". Empty if FEEDS_FILE is missing.
    """
    try:
        with open(FEEDS_FILE, 'r', encoding='utf-8') as f:
            feed_categories = json.load(f)
    except FileNotFoundError:
        logger.error(f"{FEEDS_FILE} not found. No feeds to process.")
        return []

    articles = []
    seen_links = set()

    for category, feeds in feed_categories.items():
        for feed_info in feeds:
            feed_url = feed_info.get("url")
            if not feed_url:
                logger.warning(f"Skipping feed with no URL in category '{category}'")
                continue

            try:
                logger.info(f"Fetching {feed_url}")
                response = requests.get(
                    feed_url,
                    headers={'User-Agent': 'Mozilla/5.0'},
                    timeout=timeout,
                )
                response.raise_for_status()
                raw_content = response.text
                save_raw_rss_to_file(feed_url, raw_content)  # Keep the raw feed on disk

                feed = feedparser.parse(raw_content)
                # NOTE(review): a feed flagged bozo is skipped entirely, even if
                # feedparser recovered some entries - confirm this is intended.
                if feed.bozo:
                    logger.warning(f"Parse error for {feed_url}: {feed.bozo_exception}")
                    continue

                for entry in feed.entries[:MAX_ARTICLES_PER_FEED]:
                    link = entry.get("link", "")
                    if not link or link in seen_links:
                        continue

                    # Recorded before the description check, so a link whose
                    # entry has no usable description is not retried later.
                    seen_links.add(link)
                    title = entry.get("title", "No Title")
                    description_raw = entry.get("summary", entry.get("description", ""))
                    description = clean_text(description_raw)

                    if not description:
                        continue

                    articles.append({
                        "title": title, "link": link, "description": description,
                        "published": _extract_published(entry), "category": category,
                        "image": _extract_image(entry),
                    })
            except requests.exceptions.RequestException as e:
                logger.error(f"Error fetching {feed_url}: {e}")
            except Exception as e:
                logger.error(f"Error processing {feed_url}: {e}")

    logger.info(f"Total unique articles fetched: {len(articles)}")
    return articles

def process_and_store_articles(articles, batch_size=1000):
    """Embed articles that are not yet stored and add them to the Chroma collection.

    The document ID is the SHA-256 hex digest of the article link. Articles
    whose ID is already in the collection, or that repeat a link seen earlier
    in *articles*, are skipped. Storage errors are logged, not raised.

    Args:
        articles: Iterable of dicts with the keys "title", "link",
            "description", "published", "category" and "image".
        batch_size: Maximum number of documents embedded and added per
            ``collection.add`` call.
    """
    os.makedirs(LOCAL_DB_DIR, exist_ok=True)

    client = PersistentClient(path=LOCAL_DB_DIR)
    collection = client.get_or_create_collection(name=COLLECTION_NAME)

    try:
        known_ids = set(collection.get(include=[])["ids"])
        logger.info(f"Loaded {len(known_ids)} existing document IDs from {LOCAL_DB_DIR}.")
    except Exception as e:
        # Best effort: carry on as if the collection were empty, but record why.
        logger.warning(f"Could not load existing document IDs, starting fresh: {e}")
        known_ids = set()

    contents_to_add, metadatas_to_add, ids_to_add = [], [], []
    for article in articles:
        link = article.get('link')
        if not link:
            continue
        doc_id = hashlib.sha256(link.encode('utf-8')).hexdigest()
        if doc_id in known_ids:
            continue
        # Remember the ID right away so a repeated link within this same
        # call does not put a duplicate ID into the add() request.
        known_ids.add(doc_id)

        metadatas_to_add.append({
            "title": article["title"], "link": article["link"], "published": article["published"],
            "category": article["category"], "image": article["image"],
        })
        contents_to_add.append(article["description"])
        ids_to_add.append(doc_id)

    if not ids_to_add:
        logger.info("No new articles to add to the database.")
        return

    logger.info(f"Found {len(ids_to_add)} new articles to add to the database.")
    try:
        embedding_model = get_embedding_model()
        # Work in bounded chunks: keeps the embeddings held in memory small
        # and stays under Chroma's per-call batch limit.
        step = max(1, batch_size)
        for start in range(0, len(ids_to_add), step):
            stop = start + step
            batch_docs = contents_to_add[start:stop]
            collection.add(
                embeddings=embedding_model.embed_documents(batch_docs),
                documents=batch_docs,
                metadatas=metadatas_to_add[start:stop],
                ids=ids_to_add[start:stop],
            )
        logger.info(f"Successfully added {len(ids_to_add)} new articles to DB. Total in DB: {collection.count()}")
    except Exception as e:
        logger.error(f"Error storing articles in ChromaDB: {e}", exc_info=True)

def download_from_hf_hub():
    """Fetch the Chroma DB folder from the Hub unless a local copy already exists.

    A local copy is detected by the presence of ``chroma.sqlite3`` inside
    LOCAL_DB_DIR. Download failures are logged as warnings and not raised.
    """
    sqlite_file = os.path.join(LOCAL_DB_DIR, "chroma.sqlite3")
    if os.path.exists(sqlite_file):
        logger.info(f"Local Chroma DB found at '{LOCAL_DB_DIR}', skipping download.")
        return

    try:
        logger.info(f"Downloading Chroma DB from {REPO_ID} to {LOCAL_DB_DIR}...")
        # Only files under LOCAL_DB_DIR are pulled, into the current directory.
        snapshot_download(
            repo_id=REPO_ID,
            repo_type="dataset",
            local_dir=".",
            local_dir_use_symlinks=False,
            allow_patterns=[f"{LOCAL_DB_DIR}/**"],
            token=HF_API_TOKEN,
        )
        logger.info("Finished downloading DB.")
    except Exception as exc:
        logger.warning(f"Could not download from Hugging Face Hub (this is normal on first run): {exc}")

def upload_to_hf_hub():
    """Push the Chroma DB folder and the raw RSS folder to the Hub dataset repo.

    Each folder is uploaded only if it exists locally. The two uploads are
    independent: an error in one is logged and does not prevent the other.
    Both use the same timestamped commit message.
    """
    timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    commit_message = f"Update RSS news database and raw feeds {timestamp}"

    # Chroma DB folder (backup and temp files are excluded from the upload).
    if os.path.exists(LOCAL_DB_DIR):
        db_upload_args = {
            "folder_path": LOCAL_DB_DIR,
            "path_in_repo": LOCAL_DB_DIR,
            "repo_id": REPO_ID,
            "repo_type": "dataset",
            "commit_message": commit_message,
            "ignore_patterns": ["*.bak", "*.tmp"],
        }
        try:
            logger.info(f"Uploading updated Chroma DB '{LOCAL_DB_DIR}' to {REPO_ID}...")
            hf_api.upload_folder(**db_upload_args)
            logger.info(f"Database folder '{LOCAL_DB_DIR}' uploaded to: {REPO_ID}")
        except Exception as exc:
            logger.error(f"Error uploading Chroma DB to Hugging Face Hub: {exc}", exc_info=True)

    # Raw RSS feeds folder.
    if os.path.exists(RAW_FEEDS_DIR):
        feeds_upload_args = {
            "folder_path": RAW_FEEDS_DIR,
            "path_in_repo": RAW_FEEDS_DIR,
            "repo_id": REPO_ID,
            "repo_type": "dataset",
            "commit_message": commit_message,
        }
        try:
            logger.info(f"Uploading raw RSS feeds from '{RAW_FEEDS_DIR}' to {REPO_ID}...")
            hf_api.upload_folder(**feeds_upload_args)
            logger.info(f"Raw feeds folder '{RAW_FEEDS_DIR}' uploaded to: {REPO_ID}")
        except Exception as exc:
            logger.error(f"Error uploading raw feeds to Hugging Face Hub: {exc}", exc_info=True)


def main():
    """Run one full cycle: download the DB, fetch feeds, store new articles, upload.

    Storage and upload are skipped when no articles were fetched. Any
    exception escaping a step is logged at CRITICAL level instead of raised.
    """
    try:
        download_from_hf_hub()
        fetched_articles = fetch_rss_feeds()
        if not fetched_articles:
            logger.info("No articles fetched, skipping database processing and upload.")
            return
        process_and_store_articles(fetched_articles)
        # Uploads both the Chroma DB folder and the raw feeds folder.
        upload_to_hf_hub()
    except Exception as exc:
        logger.critical(f"An unhandled error occurred in main execution: {exc}", exc_info=True)

# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()