File size: 9,683 Bytes
c0ccb24
a673fe6
 
 
 
c0ccb24
b28111b
 
 
6ec3db0
a3e15a8
c0e2a86
c0ccb24
bcabca3
08ca1e7
bcabca3
3ce355a
8e7e845
bcabca3
 
 
 
 
 
3ce355a
 
ae52df1
 
 
31bbac8
c0ccb24
31bbac8
 
c0ccb24
 
 
 
 
a673fe6
 
 
 
 
 
 
 
 
 
31bbac8
c0ccb24
31bbac8
 
c0ccb24
 
16839f4
c0ccb24
651a123
 
c8a2689
 
 
 
651a123
d26dbd8
b28111b
 
 
 
 
c0ccb24
b28111b
 
 
3ce355a
b28111b
02c8464
 
 
 
 
c5b7cd2
 
 
 
 
 
 
02c8464
c5b7cd2
 
 
 
 
 
 
 
 
 
02c8464
c5b7cd2
28a76b0
6ec3db0
9a89576
 
31bbac8
e553354
31bbac8
 
 
9a89576
31bbac8
9a89576
 
 
651a123
c0ccb24
16839f4
8e18617
 
 
 
 
 
 
 
 
 
 
 
 
 
 
04b0333
26434b4
 
fe47241
4f3995e
26434b4
fe47241
439154f
fe47241
439154f
4f3995e
 
 
 
439154f
fe47241
26434b4
fe47241
439154f
26434b4
 
04b0333
28a76b0
 
b28111b
4f0bcf9
de165fb
8e7e845
de165fb
 
ea6fa2a
d1a36ec
c6ea117
d1a36ec
28a76b0
aadb1c4
b28111b
28a76b0
 
d1a36ec
c6ea117
d1a36ec
c6ea117
a3e15a8
28a76b0
7dff81e
c6ea117
7dff81e
c6ea117
 
 
26434b4
c6ea117
 
 
7dff81e
 
 
26434b4
7dff81e
8e7e845
d1a36ec
c6ea117
d1a36ec
 
 
 
 
 
 
 
 
 
 
 
c6ea117
 
 
 
 
 
 
 
d1a36ec
b28111b
d1a36ec
 
 
 
 
 
 
979be81
31bbac8
c0ccb24
31bbac8
 
3a8152c
4d2800d
76effab
 
b7d4372
 
 
 
 
 
c7092a6
c0ccb24
6e96fb0
8903889
 
 
 
 
c07a707
8903889
c07a707
6e96fb0
c07a707
 
 
 
 
6e96fb0
c07a707
 
 
 
 
 
 
 
c0ccb24
16839f4
238f533
16839f4
c0ccb24
33aff6e
5e3a25e
 
 
 
 
 
 
 
 
 
 
 
 
 
fa0fe96
0949bb3
 
5e3a25e
 
 
1322c7a
 
 
3927bf2
 
73da755
 
33aff6e
 
 
 
73da755
 
 
 
 
33aff6e
 
 
 
 
16839f4
44d2d58
16839f4
 
e553354
 
c0ccb24
 
 
 
16839f4
 
 
 
 
e9515f9
 
d26dbd8
e9515f9
 
874f690
 
 
 
 
 
c0ccb24
e553354
16839f4
 
 
c0ccb24
31bbac8
16839f4
31bbac8
 
d3ff996
 
 
279ce8f
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
import json
import base64
import os
import shutil
import time
import gradio as gr

from rag.search import search
from rag.prompts import rag_prompt
from llm.reasoning import generate_reasoning
from llm.reasoning import generate_reasoning_from_prompt
from rag.ingest import ingest_pdfs_and_web, save_chunks

# Run the RAG ingest once at import time and time how long it takes.
print("🔄 Startar RAG-ingest")
DATA_DIR = "rag/data"
start_time = time.perf_counter()

# Build the chunks and hand them to save_chunks with DATA_DIR as output dir.
chunks = ingest_pdfs_and_web()
save_chunks(chunks, out_dir=DATA_DIR)

elapsed = time.perf_counter() - start_time

print(f"✅ Ingest klar – {len(chunks)} chunkar skapade")
print(f"⏱️ Ingest-tid: {elapsed:.2f} sekunder")


# Startup diagnostics: report only whether HF_TOKEN is set and how long it is,
# never the token value itself.
print("HF_TOKEN present:", bool(os.getenv("HF_TOKEN")))
print("HF_TOKEN length:", len(os.getenv("HF_TOKEN", "")))

# =====================================================
# DATA
# =====================================================

# Load the curated documents from the "documents" key of content.json.
with open("content.json", encoding="utf-8") as f:
    DOCUMENTS = json.load(f)["documents"]

# Lookup table: document id -> document entry.
DOC_INDEX = {d["id"]: d for d in DOCUMENTS}

# Target directory for the PDF copies; created if it does not exist yet.
PUBLIC_DIR = "/tmp/gradio/public_pdfs"
os.makedirs(PUBLIC_DIR, exist_ok=True)

# Copy every file in rag/files whose name ends in ".pdf" (case-insensitive).
# NOTE(review): nothing else visible in this file reads PUBLIC_DIR - confirm
# whether these copies are still needed.
for file in os.listdir("rag/files"):
    if file.lower().endswith(".pdf"):
        shutil.copy(
            os.path.join("rag/files", file),
            os.path.join(PUBLIC_DIR, file)
        )

# =====================================================
# FUNCTIONS
# =====================================================

def load_document(doc_id):
    """
    Return the subquestion rows for a document together with its id.

    Each row is a one-element list holding the question text, which is the
    shape used to fill the single-column subquestion table.
    """
    subquestions = DOC_INDEX[doc_id]["subquestions"]
    table_rows = []
    for entry in subquestions:
        table_rows.append([entry["question"]])
    return table_rows, doc_id


def fill_message(evt: gr.SelectData):
    """
    Return the selected value so it can be placed in the message box.

    When the event carries a list, its first element is returned; any other
    value is returned unchanged.
    """
    selected = evt.value
    return selected[0] if isinstance(selected, list) else selected

def submit(message, doc_id, debug_mode):
    """
    Central router for a submitted message.

    - If the message matches a subquestion of the selected document, return
      the stored answer followed by generated reasoning.
    - Otherwise run a free-text RAG query over the ingested material.

    Args:
        message: Text from the message box. ``None`` is treated as empty.
        doc_id: Id of the currently selected document, or ``None``.
        debug_mode: Passed to ``handle_rag_query`` as its ``debug`` flag.

    Returns:
        A ``(answer_markdown, answer_title_html)`` tuple.
    """

    # A missing value must behave like an empty message instead of raising
    # AttributeError on .strip().
    message = (message or "").strip()
    if not message:
        return "", "<h3>Svar</h3>"

    # 1) Try to match against the selected document (the classic path).
    if doc_id and doc_id in DOC_INDEX:
        doc = DOC_INDEX[doc_id]

        for q in doc["subquestions"]:
            # The message was stripped above, so compare against the stripped
            # question; otherwise stray whitespace in the stored question
            # would make a match impossible.
            if q["question"].strip() == message:
                fact_answer = format_answer(q["answer"])

                reasoning = generate_reasoning(
                    title=doc["title"],
                    main_question=doc["main_question"],
                    question=message,
                    answer=q["answer"]
                )

                combined = (
                    "### Svar\n\n"
                    + fact_answer
                    + "\n\n---\n\n"
                    + "### Resonemang\n\n"
                    + reasoning
                )

                return combined, "<h3>Svar</h3>"

    # 2) No match -> free-text RAG query.
    return handle_rag_query(message, debug_mode)

def format_answer(answer):
    """
    Render an answer mapping as Markdown.

    Each key becomes a bold heading line. A list value becomes one bullet
    per item; any other value is written on a single line. Every section
    is followed by an empty line.

    Args:
        answer: Mapping of section title to either a list of items or a
            single value.

    Returns:
        The Markdown text, or ``""`` for an empty mapping.
    """
    lines = []
    for key, value in answer.items():
        lines.append(f"**{key}**")
        if isinstance(value, list):
            lines.extend(f"- {item}" for item in value)
        else:
            # str() keeps strings unchanged and stops non-string scalars
            # (numbers, booleans) from breaking the join below.
            lines.append(str(value))
        lines.append("")
    return "\n".join(lines)


def clear_all():
    """
    Return reset values for the four outputs wired to the clear button:
    an empty row list, two empty strings and ``None``.
    """
    no_rows = []
    blank = ""
    return no_rows, blank, blank, None

def format_pages(pages):
    """
    Format page numbers as a short page reference.

    Duplicates are removed and the pages are sorted before formatting.

    Args:
        pages: Collection of page numbers, or ``None``/empty.

    Returns:
        ``""`` when there are no pages, ``"s. N"`` for a single page,
        ``"s. A–B"`` for a contiguous range, otherwise a comma-separated
        list such as ``"s. 1, 3, 7"``.
    """
    if not pages:
        return ""

    unique_pages = sorted(set(pages))
    first, last = unique_pages[0], unique_pages[-1]

    if len(unique_pages) == 1:
        return f"s. {first}"

    # Contiguous range: the span between first and last equals the count.
    # The dash between the endpoints is required; without it pages 3-5
    # would be rendered as "s. 35".
    if last - first + 1 == len(unique_pages):
        return f"s. {first}–{last}"

    return "s. " + ", ".join(str(p) for p in unique_pages)
    
def format_source_link(chunk: dict) -> str:
    """
    Build the Markdown text that presents a chunk's source.

    - ``source_type == "pdf"``: a link to the file in the GitHub repository,
      followed by the formatted page reference when there is one.
    - ``source_type == "web"``: a link whose label and target are the source.
    - anything else: the bare source string.

    A chunk without a ``source`` key is shown as "Okänd källa".
    """
    source = chunk.get("source", "Okänd källa")
    kind = chunk.get("source_type")

    if kind == "web":
        return f"🌐 [{source}]({source})"

    if kind != "pdf":
        return source

    url = (
        "https://raw.githubusercontent.com/"
        f"tomashelmfridsson/systeminforande/main/{source}"
    )
    page_info = format_pages(chunk.get("pages"))
    suffix = " — " + page_info if page_info else ""
    return f"📄 [{source}]({url}){suffix}"
    
def handle_rag_query(query: str, debug: bool):
    """
    Answer a free-text question from the retrieved material.

    Fetches the top five hits with ``search``, builds a prompt from their
    chunks with ``rag_prompt`` and generates the answer text with
    ``generate_reasoning_from_prompt``. A source list with one entry per
    distinct ``source`` is always appended. When ``debug`` is true, a debug
    section with the mean score and per-hit details is appended as well.

    Returns:
        A ``(answer_markdown, answer_title_html)`` tuple. When the search
        returns nothing, the markdown is a fixed "no material" message.
    """
    hits = search(query, top_k=5)
    if not hits:
        return (
            "Det finns inget tillräckligt underlag i materialet för att besvara frågan.",
            "<h3>Svar</h3>"
        )

    # Split the (score, chunk) pairs and compute the mean score.
    hit_scores = []
    hit_chunks = []
    for hit_score, hit_chunk in hits:
        hit_scores.append(hit_score)
        hit_chunks.append(hit_chunk)
    confidence = round(sum(hit_scores) / len(hit_scores), 2)

    # Generate the answer text from the retrieved chunks.
    answer = generate_reasoning_from_prompt(
        rag_prompt(query=query, chunks=hit_chunks)
    )

    # One chunk per distinct source; a later chunk replaces an earlier one
    # from the same source while keeping the first-seen position.
    chunk_by_source = {c["source"]: c for c in hit_chunks}

    # Sources (always shown).
    source_items = [
        f"- {format_source_link(c)}" for c in chunk_by_source.values()
    ]
    sources_md = "\n".join(["\n\n---\n\n### Källor", *source_items])

    # Debug section (optional).
    debug_md = ""
    if debug:
        debug_parts = [
            "\n\n---\n\n### Debug",
            f"**Confidence:** {confidence}",
            "",
        ]
        for hit_score, c in hits:
            entry_lines = [
                f"**📄 Källa:** {c['source']}",
                f"- **Typ:** {c.get('source_type')}",
                f"- **Rubrik:** {c.get('title')}",
                f"- **Sidor:** {c.get('pages')}",
                f"- **Score:** `{round(hit_score, 4)}`",
                "",
                # At most the first 500 characters, with an ellipsis when cut.
                f"{c['text'][:500]}{'…' if len(c['text']) > 500 else ''}",
                "---",
                "",
            ]
            debug_parts.append("\n".join(entry_lines))
        debug_md = "\n".join(debug_parts)

    # Final answer: generated text, then sources, then optional debug.
    return answer + sources_md + debug_md, "<h3>Svar</h3>"
    
# =====================================================
# UI
# =====================================================

# with gr.Blocks(css=".gradio-container {background-color: white}") as demo:
# Build the Gradio UI: a header, one card per document, a subquestion table
# next to a message box with buttons, and an answer area below.
with gr.Blocks() as demo:
    gr.HTML("<h1 class='title'>Citrus-chatbot</h1>")

    gr.Image(
        value="brain.jpg",
        show_label=False,
        interactive=False,
        elem_classes="brain-header"
    )

    # Id of the currently selected document; None until a card is clicked.
    current_doc = gr.State(None)
    
    # -------------------------
    # MAIN QUESTIONS
    # -------------------------
    with gr.Row():
        # (button, document id) pairs, wired to click handlers further down.
        main_buttons = []
    
        for doc in DOCUMENTS:
            with gr.Column(elem_classes="card"):
                # NOTE(review): title and main_question are inserted into the
                # HTML without escaping - fine only while content.json is
                # trusted.
                gr.HTML(
                    f"""
                    <div class="card-content">
                        <div class="card-title">{doc["title"]}</div>
                        <div class="card-question">{doc["main_question"]}</div>
                    </div>
                    """
                )
    
                # Button with an empty label; presumably style.css stretches
                # it over the card to make the whole card clickable - confirm.
                btn = gr.Button(
                    "",
                    elem_classes="card-overlay"
                )
    
                main_buttons.append((btn, doc["id"]))

    # -------------------------
    # CONTENT
    # -------------------------
    with gr.Row():
        
        # LEFT: subquestions
        with gr.Column(scale=2):
            gr.Markdown("<h3>Underfrågor</h3>")
            questions = gr.Dataframe(
                headers=[""],
                interactive=False,
                elem_classes="question-list"
            )
    
        # RIGHT: message
        with gr.Column(scale=3):
            gr.Markdown("<h3>Meddelande</h3>")
            message = gr.Textbox(
                placeholder="Välj ett område, klicka på en underfråga och tryck på Skicka.",
                lines=1,
                label=None,  
                show_label=False,
                elem_classes="message-box"
            )
    
            with gr.Row():
                send_btn = gr.Button("Skicka", elem_classes="send-btn")
                clear_btn = gr.Button("Rensa", elem_classes="send-btn")
                debug_mode = gr.Checkbox(
                    label="Debug",
                    value=False
                )

    # ROW 2 - answer across the full width
    with gr.Row():
        with gr.Column():
            answer_title = gr.Markdown(
                "<h3>Svar</h3>",
                elem_classes="answer-title"
            )
            
            answer = gr.Markdown(
                "",
                elem_classes="answer-box"
            )
            
    # -------------------------
    # EVENTS
    # -------------------------

    # Clicking a card loads that document's subquestions and stores its id.
    # doc_id is bound as a lambda default so each button keeps its own id
    # instead of the loop variable's final value.
    for btn, doc_id in main_buttons:
        btn.click(
            fn=lambda d=doc_id: load_document(d),
            outputs=[questions, current_doc]
        )

    # Selecting a row in the subquestion table copies it to the message box.
    questions.select(
        fn=fill_message,
        outputs=message
    )

    # The send button and Enter in the message box both run submit().
    send_btn.click(
        fn=submit,
        inputs=[message, current_doc, debug_mode],
        outputs=[answer, answer_title]
    )
    
    message.submit(
        fn=submit,
        inputs=[message, current_doc, debug_mode],
        outputs=[answer, answer_title]
    )

    # Reset the table, message box, answer and selected document.
    clear_btn.click(
        fn=clear_all,
        outputs=[questions, message, answer, current_doc]
    )

# =====================================================
# LAUNCH
# =====================================================

# Read the stylesheet from style.css and pass it to launch() as css.
with open("style.css", encoding="utf-8") as f:
    css = f.read()

# Start the app with no theme override and server-side rendering disabled.
demo.launch(theme=None,css=css, ssr_mode=False)