File size: 1,750 Bytes
7c17b1d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import json
import os
from datetime import datetime, timezone

import requests

from llama_runner import summarize_text

# GitHub contents-API endpoint listing the repository's top-level files.
REPO_API = "https://api.github.com/repos/lonardonifabio/Artificial-Intelligence-and-Data-Science/contents/"
# Local scratch directory where downloaded PDFs are written.
DOC_FOLDER = "docs_temp/"
# JSON file acting as the persistent store of processed-document records.
STORE_FILE = "document_store.json"

# Ensure the download directory exists before any processing runs.
os.makedirs(DOC_FOLDER, exist_ok=True)

def get_github_files():
    """Return the repository's top-level PDF entries from the GitHub API.

    Returns:
        list[dict]: GitHub contents-API file objects whose "name" ends
        with ".pdf" (each includes "download_url", "html_url", etc.).

    Raises:
        requests.HTTPError: if the API responds with an error status
            (previously an error JSON body would be silently iterated).
        requests.Timeout: if the API does not respond within 30 seconds.
    """
    # Timeout prevents an unresponsive API from hanging the pipeline.
    response = requests.get(REPO_API, timeout=30)
    response.raise_for_status()
    return [f for f in response.json() if f["name"].endswith(".pdf")]

def process_new_documents():
    """Download, summarize, and store any PDFs not yet in the document store.

    A document counts as processed when its file name appears as "titolo"
    in STORE_FILE. Each new PDF is downloaded into DOC_FOLDER, its text is
    extracted with PyPDF2, summarized via ``summarize_text``, and the
    resulting record (title, link, UTC upload timestamp) is appended to
    the JSON store.
    """
    # Hoisted out of the per-file loop: the original re-ran this import
    # on every iteration.
    from PyPDF2 import PdfReader

    files = get_github_files()

    # Load already-processed titles; on first run create an empty store so
    # the append phase below always finds valid JSON.
    try:
        with open(STORE_FILE, "r", encoding="utf-8") as f:
            processed = {d["titolo"] for d in json.load(f)}
    except FileNotFoundError:
        processed = set()
        with open(STORE_FILE, "w", encoding="utf-8") as f:
            json.dump([], f)

    new_docs = []
    for file in files:
        if file["name"] in processed:
            continue

        # Timeout so one stalled download cannot hang the whole run.
        r = requests.get(file["download_url"], timeout=60)
        pdf_path = os.path.join(DOC_FOLDER, file["name"])
        with open(pdf_path, "wb") as f:
            f.write(r.content)

        # Extract text once per page (the original called extract_text()
        # twice per page: once in the filter, once in the join).
        reader = PdfReader(pdf_path)
        page_texts = (page.extract_text() for page in reader.pages)
        text = "\n".join(t for t in page_texts if t)

        result = summarize_text(text)
        result["titolo"] = file["name"]
        result["link"] = file["html_url"]
        # Timezone-aware UTC timestamp; datetime.utcnow() is deprecated
        # and produced a naive datetime.
        result["data_caricamento"] = datetime.now(timezone.utc).isoformat()

        new_docs.append(result)

    # Append the new records to the persistent store in one write.
    if new_docs:
        with open(STORE_FILE, "r", encoding="utf-8") as f:
            data = json.load(f)
        data.extend(new_docs)
        with open(STORE_FILE, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2)