Spaces:

jaczad
/

Szuflada

Sleeping

App Files Files Community

jaczad committed on Aug 18, 2025

Commit

fa696d9

1 Parent(s): 6960e35

Linki już działają

Browse files

Files changed (2) hide show

app.py +163 -59
scrap.py +175 -0

app.py CHANGED Viewed

@@ -1,64 +1,168 @@
 import gradio as gr
-from huggingface_hub import InferenceClient
-"""
-For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
-"""
-client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
-def respond(
-    message,
-    history: list[tuple[str, str]],
-    system_message,
-    max_tokens,
-    temperature,
-    top_p,
-):
-    messages = [{"role": "system", "content": system_message}]
-    for val in history:
-        if val[0]:
-            messages.append({"role": "user", "content": val[0]})
-        if val[1]:
-            messages.append({"role": "assistant", "content": val[1]})
-    messages.append({"role": "user", "content": message})
-    response = ""
-    for message in client.chat_completion(
-        messages,
-        max_tokens=max_tokens,
-        stream=True,
-        temperature=temperature,
-        top_p=top_p,
-    ):
-        token = message.choices[0].delta.content
-        response += token
-        yield response
-"""
-For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
-"""
-demo = gr.ChatInterface(
-    respond,
-    additional_inputs=[
-        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(
-            minimum=0.1,
-            maximum=1.0,
-            value=0.95,
-            step=0.05,
-            label="Top-p (nucleus sampling)",
-        ),
-    ],
 )
 if __name__ == "__main__":
-    demo.launch()

 import gradio as gr
+import uuid
+from langchain_chroma import Chroma
+from langchain_openai import OpenAIEmbeddings, ChatOpenAI
+from langchain.chains import create_history_aware_retriever, create_retrieval_chain
+from langchain.chains.combine_documents import create_stuff_documents_chain
+from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
+from langchain_core.chat_history import BaseChatMessageHistory
+from langchain_community.chat_message_histories import ChatMessageHistory
+from langchain_core.runnables.history import RunnableWithMessageHistory
+# --- 1. Model and retriever initialisation: chat LLM, embedder, Chroma store and retriever ---
+llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.1)  # used for both question rewriting and answering
+embedder = OpenAIEmbeddings(model="text-embedding-3-small")  # same embedding model scrap.py uses to build the collection
+baza = Chroma(
+    collection_name="szuflada",
+    embedding_function=embedder,
+    persist_directory="./szuflada"  # on-disk store; same collection name and directory as in scrap.py
+)
+# Search types available on the Chroma retriever:
+# - "similarity" (default, no threshold)
+# - "mmr" (Maximal Marginal Relevance)
+# - "similarity_score_threshold" (uses a score_threshold cut-off)
+# Plain similarity search is configured here:
+retriever = baza.as_retriever(
+    search_type="similarity",  # <- switch to "mmr" or "similarity_score_threshold" if desired
+    search_kwargs={"k": 5}  # presumably the number of chunks returned per query - confirm against the retriever docs
+)
+# --- 2. Building the history-aware RAG chain: question rewriting, retrieval, answering ---
+contextualize_q_system_prompt = (  # Polish instruction: rewrite the latest question as a standalone one, do not answer it
+    "Biorąc pod uwagę historię czatu i ostatnie pytanie użytkownika, "
+    "które może odnosić się do kontekstu w historii czatu, "
+    "sformułuj samodzielne pytanie, które można zrozumieć bez historii czatu. "
+    "NIE odpowiadaj na pytanie, po prostu przeformułuj je, jeśli to konieczne, "
+    "a w przeciwnym razie zwróć je w niezmienionej formie."
+)
+contextualize_q_prompt = ChatPromptTemplate.from_messages(
+    [
+        ("system", contextualize_q_system_prompt),
+        MessagesPlaceholder("chat_history"),  # earlier turns of the conversation are injected here
+        ("human", "{input}"),  # the user's latest question
+    ]
+)
+history_aware_retriever = create_history_aware_retriever(  # retriever that is given the rewriting prompt and the LLM
+    llm, retriever, contextualize_q_prompt
+)
+qa_system_prompt = (  # Polish instruction: answer from the retrieved context, always in Polish, admit when unsure, be concise
+    "Jesteś asystentem do zadawania pytań i odpowiedzi na temat treści ze strony mojaszuflada.pl. "
+    "Użyj poniższych fragmentów odzyskanego kontekstu, aby odpowiedzieć na pytanie. "
+    "Odpowiadaj zawsze w języku polskim. "
+    "Jeśli nie znasz odpowiedzi, po prostu powiedz, że nie wiesz. "
+    "Zachowaj zwięzłość odpowiedzi, ale bądź pomocny i przyjazny."
+    "\n\n{context}"  # presumably filled with the retrieved documents by the stuff-documents chain - confirm
+)
+qa_prompt = ChatPromptTemplate.from_messages(
+    [
+        ("system", qa_system_prompt),
+        MessagesPlaceholder("chat_history"),  # same history key as in the rewriting prompt
+        ("human", "{input}"),
+    ]
+)
+question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)  # answering step built from the LLM and the QA prompt
+rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)  # retrieval + answering combined
+store = {}
+def get_session_history(session_id: str) -> BaseChatMessageHistory:
+    """Return the chat history stored for ``session_id``, creating an empty one on first use."""
+    try:
+        return store[session_id]
+    except KeyError:
+        # First message of this session: register a fresh in-memory history.
+        history = store[session_id] = ChatMessageHistory()
+        return history
+conversational_rag_chain = RunnableWithMessageHistory(  # wraps rag_chain with per-session message history
+    rag_chain,
+    get_session_history,  # looks up (or creates) the history for the session_id passed in the call config
+    input_messages_key="input",  # matches the "{input}" slot of both prompts
+    history_messages_key="chat_history",  # matches MessagesPlaceholder("chat_history") in both prompts
+    output_messages_key="answer",  # the key respond() reads from the chain result
 )
+# --- 3. Funkcje pomocnicze dla Gradio ---
+def format_sources(source_docs):
+    """Render the retrieved documents as a Markdown bullet list of sources.
+
+    Args:
+        source_docs: Sized iterable of retrieved documents. Each item must
+            expose a ``metadata`` mapping; the keys ``title``, ``source`` and
+            ``published_time`` are used when present.
+
+    Returns:
+        A Markdown string with one bullet per document followed by a debug
+        line with the chunk count, or a placeholder message when
+        ``source_docs`` is empty.
+    """
+    if not source_docs:
+        return "_Brak źródeł do wyświetlenia._"
+    sources = []
+    for doc in source_docs:
+        metadata = doc.metadata or {}
+        # Fall back to the placeholder for a missing *or* empty title.
+        title = metadata.get("title") or "Brak tytułu"
+        source_url = metadata.get("source")
+        # Only emit a link when a real URL exists; a placeholder text used as
+        # the link target would render as a broken Markdown link.
+        entry = f"- [{title}]({source_url})" if source_url else f"- {title}"
+        pub_date_raw = metadata.get("published_time")
+        if pub_date_raw:
+            # ISO timestamps look like 2011-03-08T10:00:00; keep the date part only.
+            pub_date = str(pub_date_raw).split("T")[0]
+            entry += f" ({pub_date})"
+        sources.append(entry)
+    # Chunk count appended for debugging.
+    sources.append(f"\n_Znaleziono chunków: {len(source_docs)}_")
+    return "\n".join(sources)
+# --- 4. Building the Gradio interface: session state, header, chat window and input row ---
+with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue"), title="Szuflada Chatbot") as demo:
+    session_id = gr.State(lambda: str(uuid.uuid4()))  # callable default - presumably evaluated per page load so each visitor gets its own id; confirm in the Gradio State docs
+    gr.Markdown(
+        """# Czat z Szufladą\n### Zadaj pytanie na temat treści ze strony [mojaszuflada.pl](https://mojaszuflada.pl)
+        """
+    )
+    chatbot = gr.Chatbot(  # conversation window; respond() appends (question, answer) tuples to its value
+        label="Rozmowa",
+        height=500,
+    )
+    with gr.Row():  # question box and send button share one row
+        msg = gr.Textbox(
+            show_label=False,
+            placeholder="Wpisz swoje pytanie...",
+            container=False,
+            scale=7,
+        )
+        submit_btn = gr.Button("Wyślij", variant="primary", scale=1)
+    def respond(message, chat_history, sess_id):
+        """Answer ``message`` with the RAG chain and append the turn to the chat.
+
+        Args:
+            message: Text typed by the user.
+            chat_history: Current value of the chatbot component, a list of
+                ``(question, answer)`` tuples; ``None`` is treated as empty.
+            sess_id: Session identifier used to select the stored history.
+
+        Returns:
+            The chat history including the new question/answer pair, or the
+            unchanged history when ``message`` is blank.
+        """
+        chat_history = chat_history or []
+        if not message or not message.strip():
+            # Nothing to ask: avoid a needless LLM call and an empty chat bubble.
+            return chat_history
+        response = conversational_rag_chain.invoke(
+            {"input": message},
+            config={"configurable": {"session_id": sess_id}},
+        )
+        # Log every retrieved chunk for debugging.
+        context_docs = response.get("context", [])
+        for i, doc in enumerate(context_docs, start=1):
+            score = doc.metadata.get("score", "brak score")
+            print(f"Chunk {i}: score={score}, title={doc.metadata.get('title')}")
+        sources_md = format_sources(context_docs)
+        answer_with_sources = response["answer"] + "\n\n**Źródła:**\n" + sources_md
+        chat_history.append((message, answer_with_sources))
+        return chat_history
+    # Clicking the button and pressing Enter in the textbox run the same
+    # pipeline: answer the question first, then clear the input field.
+    for register_trigger in (submit_btn.click, msg.submit):
+        answered = register_trigger(
+            respond,
+            [msg, chatbot, session_id],
+            [chatbot],
+        )
+        answered.then(lambda: gr.update(value=""), None, [msg], queue=False)
+# --- 5. Uruchomienie aplikacji ---
 if __name__ == "__main__":
+    # --- TEST: query the retriever directly before starting the UI ---
+    test_query = "test"  # <- put a phrase here that should exist in the database
+    print("\n=== TEST RETRIEVER ===")
+    # invoke() is the supported entry point; get_relevant_documents() is deprecated.
+    docs = retriever.invoke(test_query)
+    print(f"Znaleziono {len(docs)} dokumentów dla zapytania: '{test_query}'")
+    for i, doc in enumerate(docs, start=1):
+        score = doc.metadata.get("score", "brak score")
+        print(f"Chunk {i}: score={score}, title={doc.metadata.get('title')}, source={doc.metadata.get('source')}")
+    print("=== KONIEC TESTU ===\n")
+    # --- END OF TEST ---
+    demo.launch(inbrowser=True)

scrap.py ADDED Viewed

	@@ -0,0 +1,175 @@

+from langchain_community.document_loaders import SitemapLoader
+from bs4 import BeautifulSoup
+import re
+from langchain_chroma import Chroma
+from langchain_openai import OpenAIEmbeddings
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_core.documents import Document
+import requests
+from tqdm import tqdm
+def _meta_contents(soup, prop):
+    """Collect the non-empty ``content`` values of every ``<meta property=prop>`` tag."""
+    values = []
+    for tag in soup.find_all("meta", property=prop):
+        if tag.get("content"):
+            values.append(tag["content"])
+    return values
+def _extract_published_time(soup):
+    """Return the publication date found in the page, or ``None``.
+
+    Lookup order: ``<meta property="article:published_time">``, then the
+    ``datetime`` attribute or text of the first ``<time>`` tag, then a regex
+    search for the Polish 'Opublikowano w dniu 8 marca 2011' phrase.
+    """
+    meta_tag = soup.find("meta", property="article:published_time")
+    if meta_tag and meta_tag.get("content"):
+        return meta_tag["content"]
+    time_tag = soup.find("time")
+    if time_tag:
+        if time_tag.get("datetime"):
+            return time_tag.get("datetime")
+        inline_text = time_tag.get_text(strip=True)
+        if inline_text:
+            return inline_text
+    page_text = soup.get_text(separator="\n", strip=True)
+    match = re.search(r"Opublikowano(?: w dniu)?[:\s]+([0-9]{1,2}\s+\w+\s+\d{4})", page_text, re.IGNORECASE)
+    return match.group(1) if match else None
+def process_documents(docs: list[Document]) -> list[Document]:
+    """
+    Convert raw HTML documents into plain-text documents with extracted metadata.
+
+    For each input document the text of the ``<article>`` element (or of the
+    whole page when there is none) becomes the new content. The existing
+    metadata is copied and extended with ``title``, ``published_time``,
+    ``categories`` and ``keywords`` whenever the page provides them.
+    """
+    cleaned = []
+    for raw_doc in docs:
+        soup = BeautifulSoup(raw_doc.page_content, "lxml")
+        # Main content: prefer the <article> element, fall back to the whole page.
+        text_root = soup.find("article") or soup
+        body_text = text_root.get_text(separator="\n", strip=True)
+        # Start from the existing metadata (e.g. source) without mutating the input.
+        meta = raw_doc.metadata.copy()
+        # The title comes from the <title> tag only.
+        title_text = soup.title.get_text(strip=True) if soup.title else ""
+        if title_text:
+            meta["title"] = title_text
+        published = _extract_published_time(soup)
+        if published is not None:
+            meta["published_time"] = published
+        # Categories and keywords (tags) are stored as comma-separated strings.
+        for key, prop in (("categories", "article:section"), ("keywords", "article:tag")):
+            values = _meta_contents(soup, prop)
+            if values:
+                meta[key] = ", ".join(values)
+        cleaned.append(Document(page_content=body_text, metadata=meta))
+    return cleaned
+# --- Configuration ---
+embedder = OpenAIEmbeddings(model="text-embedding-3-small", show_progress_bar=True)
+headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
+sitemap_url = "https://mojaszuflada.pl/wp-sitemap.xml"
+# Seconds to wait for the server; without a timeout a stalled connection blocks the script forever.
+REQUEST_TIMEOUT = 30
+def _fetch_text(url: str) -> str:
+    """Download ``url`` and return the body; raises ``requests.RequestException`` on failure."""
+    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
+    response.raise_for_status()
+    return response.text
+def _sitemap_locations(xml_text: str) -> list[str]:
+    """Return the whitespace-stripped text of every ``<loc>`` element of a sitemap document."""
+    return [loc.text.strip() for loc in BeautifulSoup(xml_text, "xml").find_all("loc")]
+# --- Downloading the pages listed in the sitemap ---
+print("Pobieranie i parsowanie mapy strony...")
+docs = []
+try:
+    urls = _sitemap_locations(_fetch_text(sitemap_url))
+    # The index sitemap mixes links to nested sitemaps (*.xml) with page URLs.
+    sitemap_urls = [url for url in urls if url.endswith(".xml")]
+    page_urls = [url for url in urls if not url.endswith(".xml")]
+    for sub_sitemap_url in tqdm(sitemap_urls, desc="Parsowanie pod-map"):
+        try:
+            page_urls.extend(_sitemap_locations(_fetch_text(sub_sitemap_url)))
+        except requests.RequestException as e:
+            print(f"Pominięto pod-mapę {sub_sitemap_url}: {e}")
+    print(f"Znaleziono {len(page_urls)} adresów URL do przetworzenia.")
+    for url in tqdm(page_urls, desc="Pobieranie stron"):
+        try:
+            docs.append(
+                Document(
+                    page_content=_fetch_text(url),
+                    metadata={"source": url, "loc": url}
+                )
+            )
+        except requests.RequestException as e:
+            print(f"Pominięto stronę {url}: {e}")
+except requests.RequestException as e:
+    print(f"Krytyczny błąd: Nie udało się pobrać głównej mapy strony: {e}")
+    # docs stays empty, so the check below ends the script.
+if not docs:
+    print("Nie załadowano żadnych dokumentów. Zakończenie pracy.")
+    # Nothing was deleted yet, so an existing index survives a failed crawl.
+    # SystemExit is used directly because exit() is only a helper injected by the site module.
+    raise SystemExit(0)
+# --- Extracting content and metadata, splitting into chunks ---
+processed_docs = process_documents(docs)
+print("\nPrzykładowe metadane przetworzonych dokumentów (pierwsze 5):")
+for sample_doc in processed_docs[:5]:
+    print(sample_doc.metadata)
+text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
+chunks = text_splitter.split_documents(processed_docs)
+batch_size = 1000
+# --- CHUNK METADATA VALIDATION ---
+# Count the chunks lacking each expected metadata key (source, title, publication date).
+required_meta_keys = ["source", "title", "published_time"]
+missing_counts = {k: 0 for k in required_meta_keys}
+for chunk in chunks:
+    md = chunk.metadata or {}
+    for k in required_meta_keys:
+        if not md.get(k):
+            missing_counts[k] += 1
+print(f"Liczba chunków: {len(chunks)}")
+print("Braki metadanych (liczba chunków bez klucza/wartości):", missing_counts)
+print("Przykładowe metadane dla pierwszych 5 chunków:")
+for sample in chunks[:5]:
+    print(sample.metadata)
+# --- END OF VALIDATION ---
+# --- Rebuilding the collection ---
+# The old collection is dropped only now, once fresh chunks are ready, so the
+# index is never left empty because the crawl failed.
+baza = Chroma(collection_name="szuflada", embedding_function=embedder, persist_directory="./szuflada")
+print("Czyszczenie istniejącej kolekcji w bazie danych...")
+try:
+    baza.delete_collection()
+    print("Kolekcja została wyczyszczona.")
+    # After deleting the collection the Chroma object has to be created again.
+    baza = Chroma(collection_name="szuflada", embedding_function=embedder, persist_directory="./szuflada")
+except Exception as e:
+    # Deliberate best effort: the collection may simply not exist yet.
+    print(f"Nie można było wyczyścić kolekcji (może nie istniała): {e}")
+# Add the chunks in batches of batch_size documents per call.
+for i in range(0, len(chunks), batch_size):
+    baza.add_documents(documents=chunks[i:i + batch_size])