"""Sync new PDF documents from a GitHub repository into a local JSON summary store.

Fetches the repo's file listing, downloads any PDF not yet recorded in
``document_store.json``, extracts its text, summarizes it via ``llama_runner``,
and appends the summary (with title, link, and upload timestamp) to the store.
"""

import json
import os
from datetime import datetime, timezone

import requests
from PyPDF2 import PdfReader

from llama_runner import summarize_text

REPO_API = "https://api.github.com/repos/lonardonifabio/Artificial-Intelligence-and-Data-Science/contents/"
DOC_FOLDER = "docs_temp/"
STORE_FILE = "document_store.json"

os.makedirs(DOC_FOLDER, exist_ok=True)


def get_github_files():
    """Return the repo's top-level file entries whose names end in ``.pdf``.

    Raises:
        requests.HTTPError: on a non-2xx API response (e.g. rate limiting),
            instead of crashing later on an unexpected JSON payload.
    """
    response = requests.get(REPO_API, timeout=30)
    response.raise_for_status()
    return [f for f in response.json() if f["name"].endswith(".pdf")]


def _load_processed_titles():
    """Return the set of already-processed document titles.

    Creates an empty store file on first run so later reads succeed.
    """
    try:
        with open(STORE_FILE, "r", encoding="utf-8") as f:
            return {d["titolo"] for d in json.load(f)}
    except FileNotFoundError:
        with open(STORE_FILE, "w", encoding="utf-8") as f:
            json.dump([], f)
        return set()


def _extract_pdf_text(pdf_path):
    """Return the text of all pages joined by newlines, skipping empty pages.

    Each page is extracted exactly once (extraction is the expensive step).
    """
    reader = PdfReader(pdf_path)
    page_texts = (page.extract_text() for page in reader.pages)
    return "\n".join(text for text in page_texts if text)


def process_new_documents():
    """Download, summarize, and record any repo PDFs not yet in the store.

    Side effects: writes PDFs under ``DOC_FOLDER`` and appends summary dicts
    to ``STORE_FILE``. Documents already listed in the store are skipped.
    """
    files = get_github_files()
    processed = _load_processed_titles()

    new_docs = []
    for file in files:
        if file["name"] in processed:
            continue

        # Download the PDF; timeout + status check so a failed download
        # cannot silently write an HTML error page to disk.
        r = requests.get(file["download_url"], timeout=60)
        r.raise_for_status()
        pdf_path = os.path.join(DOC_FOLDER, file["name"])
        with open(pdf_path, "wb") as f:
            f.write(r.content)

        result = summarize_text(_extract_pdf_text(pdf_path))
        result["titolo"] = file["name"]
        result["link"] = file["html_url"]
        # Timezone-aware UTC timestamp: datetime.utcnow() is deprecated
        # and produces a naive datetime.
        result["data_caricamento"] = datetime.now(timezone.utc).isoformat()
        new_docs.append(result)

    # Append the new summaries to the persistent store in one read/write pass.
    if new_docs:
        with open(STORE_FILE, "r", encoding="utf-8") as f:
            data = json.load(f)
        data.extend(new_docs)
        with open(STORE_FILE, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2)