import json import base64 import os import shutil import time import gradio as gr from rag.search import search from rag.prompts import rag_prompt from llm.reasoning import generate_reasoning from llm.reasoning import generate_reasoning_from_prompt from rag.ingest import ingest_pdfs_and_web, save_chunks print("🔄 Startar RAG-ingest") DATA_DIR = "rag/data" start_time = time.perf_counter() chunks = ingest_pdfs_and_web() save_chunks(chunks, out_dir=DATA_DIR) elapsed = time.perf_counter() - start_time print(f"✅ Ingest klar – {len(chunks)} chunkar skapade") print(f"⏱ Ingest-tid: {elapsed:.2f} sekunder") print("HF_TOKEN present:", bool(os.getenv("HF_TOKEN"))) print("HF_TOKEN length:", len(os.getenv("HF_TOKEN", ""))) # ===================================================== # DATA # ===================================================== with open("content.json", encoding="utf-8") as f: DOCUMENTS = json.load(f)["documents"] DOC_INDEX = {d["id"]: d for d in DOCUMENTS} PUBLIC_DIR = "/tmp/gradio/public_pdfs" os.makedirs(PUBLIC_DIR, exist_ok=True) for file in os.listdir("rag/files"): if file.lower().endswith(".pdf"): shutil.copy( os.path.join("rag/files", file), os.path.join(PUBLIC_DIR, file) ) # ===================================================== # FUNKTIONER # ===================================================== def load_document(doc_id): rows = [[q["question"]] for q in DOC_INDEX[doc_id]["subquestions"]] return rows, doc_id def fill_message(evt: gr.SelectData): value = evt.value if isinstance(value, list): return value[0] return value def submit(message, doc_id, debug_mode): """ Central router: - Om message matchar en underfrĂ„ga → vanlig Q&A - Annars → RAG över PDF-material """ message = message.strip() if not message: return "", "

Svar

" # 1ïžâƒŁ Försök matcha mot valt dokument (klassisk vĂ€g) if doc_id and doc_id in DOC_INDEX: doc = DOC_INDEX[doc_id] for q in doc["subquestions"]: if q["question"] == message: fact_answer = format_answer(q["answer"]) reasoning = generate_reasoning( title=doc["title"], main_question=doc["main_question"], question=message, answer=q["answer"] ) combined = ( "### Svar\n\n" + fact_answer + "\n\n---\n\n" + "### Resonemang\n\n" + reasoning ) return combined, "

Svar

" # 2ïžâƒŁ Ingen match → RAG-fritext return handle_rag_query(message, debug_mode) def format_answer(answer): out = [] for key, value in answer.items(): out.append(f"**{key}**") if isinstance(value, list): for item in value: out.append(f"- {item}") else: out.append(value) out.append("") return "\n".join(out) def clear_all(): return [], "", "", None def format_pages(pages): if not pages: return "" pages = sorted(set(pages)) if len(pages) == 1: return f"s. {pages[0]}" # sammanhĂ€ngande intervall if pages[-1] - pages[0] + 1 == len(pages): return f"s. {pages[0]}–{pages[-1]}" return "s. " + ", ".join(str(p) for p in pages) def format_source_link(chunk: dict) -> str: source = chunk.get("source", "OkĂ€nd kĂ€lla") source_type = chunk.get("source_type") pages = chunk.get("pages") if source_type == "pdf": page_info = format_pages(pages) return ( f"📄 " f"[{source}](" f"https://raw.githubusercontent.com/" f"tomashelmfridsson/systeminforande/main/{source}" f")" f"{' — ' + page_info if page_info else ''}" ) if source_type == "web": return f"🌐 [{source}]({source})" return source def handle_rag_query(query: str, debug: bool): results = search(query, top_k=5) if not results: return ( "Det finns inget tillrĂ€ckligt underlag i materialet för att besvara frĂ„gan.", "

Svar

" ) # ----------------------------- # Confidence score # ----------------------------- scores = [score for score, _ in results] confidence = round(sum(scores) / len(scores), 2) chunks = [chunk for _, chunk in results] # ----------------------------- # Generera svar # ----------------------------- prompt = rag_prompt(query=query, chunks=chunks) answer = generate_reasoning_from_prompt(prompt) # ----------------------------- # Bygg anvĂ€nda kĂ€llor (VIKTIGT: DEFINIERAS HÄR) # ----------------------------- used_sources = {} for _, c in results: used_sources[c["source"]] = c # ----------------------------- # KĂ€llor (visas alltid) # ----------------------------- sources_lines = ["\n\n---\n\n### KĂ€llor"] for c in used_sources.values(): sources_lines.append(f"- {format_source_link(c)}") sources_md = "\n".join(sources_lines) # ----------------------------- # Debug (valfritt) # ----------------------------- debug_md = "" if debug: debug_lines = [ "\n\n---\n\n### Debug", f"**Confidence:** {confidence}", "" ] for score, c in results: debug_lines.append( f"""**📄 KĂ€lla:** {c['source']} - **Typ:** {c.get('source_type')} - **Rubrik:** {c.get('title')} - **Sidor:** {c.get('pages')} - **Score:** `{round(score, 4)}` {c['text'][:500]}{'
' if len(c['text']) > 500 else ''} --- """ ) debug_md = "\n".join(debug_lines) # ----------------------------- # Slutligt svar # ----------------------------- final_answer = answer + sources_md + debug_md return final_answer, "

Svar

" # ===================================================== # UI # ===================================================== # with gr.Blocks(css=".gradio-container {background-color: white}") as demo: with gr.Blocks() as demo: gr.HTML("

Citrus-chatbot

") gr.Image( value="brain.jpg", show_label=False, interactive=False, elem_classes="brain-header" ) current_doc = gr.State(None) # ------------------------- # HUVUDFRÅGOR # ------------------------- with gr.Row(): main_buttons = [] for doc in DOCUMENTS: with gr.Column(elem_classes="card"): gr.HTML( f"""
{doc["title"]}
{doc["main_question"]}
""" ) btn = gr.Button( "", elem_classes="card-overlay" ) main_buttons.append((btn, doc["id"])) # ------------------------- # INNEHÅLL # ------------------------- with gr.Row(): # VÄNSTER: UnderfrĂ„gor with gr.Column(scale=2): gr.Markdown("

UnderfrÄgor

") questions = gr.Dataframe( headers=[""], interactive=False, elem_classes="question-list" ) # HÖGER: Meddelande with gr.Column(scale=3): gr.Markdown("

Meddelande

") message = gr.Textbox( placeholder="VĂ€lj ett omrĂ„de, klicka pĂ„ en underfrĂ„ga och tryck pĂ„ Skicka.", lines=1, label=None, show_label=False, elem_classes="message-box" ) with gr.Row(): send_btn = gr.Button("Skicka", elem_classes="send-btn") clear_btn = gr.Button("Rensa", elem_classes="send-btn") debug_mode = gr.Checkbox( label="Debug", value=False ) # RAD 2 – Svar över hela bredden with gr.Row(): with gr.Column(): answer_title = gr.Markdown( "

Svar

", elem_classes="answer-title" ) answer = gr.Markdown( "", elem_classes="answer-box" ) # ------------------------- # EVENTS # ------------------------- for btn, doc_id in main_buttons: btn.click( fn=lambda d=doc_id: load_document(d), outputs=[questions, current_doc] ) questions.select( fn=fill_message, outputs=message ) send_btn.click( fn=submit, inputs=[message, current_doc, debug_mode], outputs=[answer, answer_title] ) message.submit( fn=submit, inputs=[message, current_doc, debug_mode], outputs=[answer, answer_title] ) clear_btn.click( fn=clear_all, outputs=[questions, message, answer, current_doc] ) # ===================================================== # LAUNCH # ===================================================== with open("style.css", encoding="utf-8") as f: css = f.read() demo.launch(theme=None,css=css, ssr_mode=False)