# Document ingestion script: fetch new PDF files from the GitHub repository,
# summarize each one, and append the results to a local JSON document store.
import json
import os
from datetime import datetime, timezone

import requests

from llama_runner import summarize_text
# GitHub contents-API endpoint listing the repository's files.
REPO_API = "https://api.github.com/repos/lonardonifabio/Artificial-Intelligence-and-Data-Science/contents/"
# Local scratch folder where downloaded PDFs are written.
DOC_FOLDER = "docs_temp/"
# JSON list of processed-document metadata entries.
STORE_FILE = "document_store.json"

# Ensure the download folder exists before any PDF is written.
os.makedirs(DOC_FOLDER, exist_ok=True)
def get_github_files():
    """Return the repository's PDF entries from the GitHub contents API.

    Each entry is the raw GitHub API dict (with ``name``, ``download_url``,
    ``html_url``, ...); only files whose name ends in ``.pdf`` are kept.

    Raises:
        requests.HTTPError: if the API responds with an error status
            (previously an error payload would fail later with a
            confusing KeyError).
    """
    # timeout prevents the script from hanging forever on a stalled connection
    response = requests.get(REPO_API, timeout=30)
    response.raise_for_status()
    return [f for f in response.json() if f["name"].endswith(".pdf")]
def process_new_documents():
    """Download, summarize and persist every repository PDF not yet in the store.

    The store (``STORE_FILE``) is a JSON list of dicts; each entry's
    ``"titolo"`` field is the PDF file name and serves as the
    de-duplication key. New entries are appended at the end.
    """
    # Hoisted out of the per-file loop, where the original re-imported
    # it on every iteration.
    from PyPDF2 import PdfReader

    files = get_github_files()

    # Load the set of already-processed titles; create an empty store on
    # first run so the later read never fails.
    try:
        with open(STORE_FILE, "r", encoding="utf-8") as f:
            processed = {d["titolo"] for d in json.load(f)}
    except FileNotFoundError:
        processed = set()
        with open(STORE_FILE, "w", encoding="utf-8") as f:
            json.dump([], f)

    new_docs = []
    for file in files:
        if file["name"] in processed:
            continue  # already summarized in a previous run

        # Download the PDF into the local scratch folder.
        r = requests.get(file["download_url"], timeout=60)
        r.raise_for_status()
        pdf_path = os.path.join(DOC_FOLDER, file["name"])
        with open(pdf_path, "wb") as f:
            f.write(r.content)

        # Raw text extraction (PyPDF2); extract_text() is expensive, so
        # call it once per page and skip pages that yield no text.
        reader = PdfReader(pdf_path)
        page_texts = (page.extract_text() for page in reader.pages)
        text = "\n".join(t for t in page_texts if t)

        result = summarize_text(text)
        result["titolo"] = file["name"]
        result["link"] = file["html_url"]
        # Timezone-aware UTC timestamp; datetime.utcnow() is deprecated
        # and produced a naive datetime.
        result["data_caricamento"] = datetime.now(timezone.utc).isoformat()
        new_docs.append(result)

    # Append the new entries to the persistent store.
    if new_docs:
        with open(STORE_FILE, "r", encoding="utf-8") as f:
            data = json.load(f)
        data.extend(new_docs)
        with open(STORE_FILE, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2)