File size: 7,722 Bytes
21e7d03
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
977750d
 
21e7d03
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
import streamlit as st
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import PyPDFLoader
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
import torch
import uuid
import os
import langid

st.title("RAG PDF Q&A με DeepSeek-7B και Paraphrasing (μόνο Αγγλικά)")

st.sidebar.header("Ανέβασε PDF")
pdf = st.sidebar.file_uploader("Επίλεξε PDF", type="pdf")

if pdf:
    temp_filename = f"temp_{uuid.uuid4()}.pdf"
    with open(temp_filename, "wb") as f:
        f.write(pdf.read())

    loader = PyPDFLoader(temp_filename)
    pages = loader.load()

    st.sidebar.success(f"Το έγγραφο έχει {len(pages)} σελίδες.")

    # ---- Φόρτωση DeepSeek ----
    @st.cache_resource
    def load_llm():
        MODEL_ID = "deepseek-ai/deepseek-llm-7b-chat"

        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
        )

        tokenizer = AutoTokenizer.from_pretrained(
            MODEL_ID,
            trust_remote_code=True
        )
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            quantization_config=bnb_config,
            device_map="auto",
            trust_remote_code=True
        )
        return tokenizer, model

    tokenizer, model = load_llm()

    # ---- Paraphrasing για Αγγλικά ----
    @st.cache_resource
    def load_en_paraphraser():
        return pipeline("text2text-generation", model="ramsrigouthamg/t5_paraphraser")

    paraphraser_en = load_en_paraphraser()

    translation_pipeline = pipeline("translation", model="Helsinki-NLP/opus-mt-en-el")

    total_words = sum(len(page.page_content.split()) for page in pages)
    avg_words_per_page = total_words / len(pages)

    st.sidebar.info(f"Μέσος όρος λέξεων ανά σελίδα: {int(avg_words_per_page)}")

    proposed_chunk_size = 1000
    proposed_overlap = 300

    user_chunk_size = st.sidebar.number_input("Επίλεξε Chunk size", value=proposed_chunk_size, step=50)
    user_overlap = st.sidebar.number_input("Επίλεξε Overlap", value=proposed_overlap, step=50)

    text_splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", ". ", "! ", "; ", "? ", "  ", " "],
        chunk_size=user_chunk_size,
        chunk_overlap=user_overlap
    )

    docs = text_splitter.split_documents(pages)

    # ---- Προσθήκη custom ids ----
    for idx, doc in enumerate(docs):
        doc.metadata["custom_id"] = idx

    st.success(f"Επεξεργάστηκαν {len(docs)} chunks.")

    embedding_function = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )

    # ---- Ανέβηκε νέο PDF; Καθάρισε vectordb ----
    if st.session_state.get("loaded_filename") != temp_filename:
        st.session_state.vectordb = None
        st.session_state.retriever = None
        st.session_state.docs = None
        st.session_state.loaded_filename = temp_filename

    if not st.session_state.get("vectordb"):
        vectordb = Chroma(
            collection_name=f"rag_pdf_collection_{uuid.uuid4()}",
            embedding_function=embedding_function
        )
        vectordb.add_documents(docs)
        retriever = vectordb.as_retriever(
            search_kwargs={"k": 6}
        )
        st.session_state.vectordb = vectordb
        st.session_state.retriever = retriever
        st.session_state.docs = docs
    else:
        retriever = st.session_state.retriever
        docs = st.session_state.docs

    st.sidebar.success("Έτοιμο το retriever.")

    # ---- Rephrasing ----
    def rephrase_question(original_question, lang):
        variations = [original_question]
        paraphrase_prompt = f"paraphrase: {original_question} </s>"

        try:
            if lang == "el":
                # Δεν υπάρχει διαθέσιμο paraphrasing για ελληνικά προς το παρόν
                rephrases = []
            else:
                output = paraphraser_en(paraphrase_prompt, max_length=64, num_return_sequences=3, do_sample=True)
                rephrases = list({o['generated_text'].strip() for o in output})
            variations.extend(rephrases)
        except Exception as e:
            st.sidebar.warning("Το paraphrasing απέτυχε. Χρησιμοποιούμε μόνο την αρχική ερώτηση.")

        return variations

    def generate_answer(question):
        detected_lang = langid.classify(question)[0]
        if detected_lang == "el":
            lang_instruction = "Απάντησε στα Ελληνικά."
            fallback_response = "Δεν γνωρίζω."
        else:
            lang_instruction = "Answer in English."
            fallback_response = "I do not know."

        variations = rephrase_question(question, detected_lang)

        st.sidebar.info(f"Εναλλακτικές ερωτήσεις: {variations}")

        all_docs = []
        for var in variations:
            docs_found = retriever.get_relevant_documents(var)
            all_docs.extend(docs_found)

        unique_docs = list({doc.page_content: doc for doc in all_docs}.values())
        context = "\n\n".join([doc.page_content for doc in unique_docs])

        chunk_ids = []
        for doc in unique_docs:
            if "custom_id" in doc.metadata:
                chunk_ids.append(doc.metadata["custom_id"])

        prompt = f"""
{lang_instruction}
Χρησιμοποίησε ΜΟΝΟ τα συμφραζόμενα.
Αν δεν υπάρχει απάντηση στα συμφραζόμενα, πες ρητά: {fallback_response}
Συμφραζόμενα:
{context}
Ερώτηση: {question}
Απάντηση:"""

        inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
        output = model.generate(
            **inputs,
            max_new_tokens=500,
            temperature=0.3,
            repetition_penalty=1.2,
            eos_token_id=tokenizer.eos_token_id,
        )

        full_answer = tokenizer.decode(output[0], skip_special_tokens=True)

        if "Απάντηση:" in full_answer:
            clean_answer = full_answer.split("Απάντηση:")[-1].strip()
        else:
            clean_answer = full_answer.strip()

        if clean_answer == "" or any(
            bad in clean_answer.lower()
            for bad in ["απάντησε", "συμφραζόμενα", question.lower()]
        ):
            clean_answer = fallback_response

        if detected_lang == "el" and clean_answer != fallback_response:
            if sum(c.isalpha() and c in 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ' for c in clean_answer) > len(clean_answer) * 0.4:
                translation = translation_pipeline(clean_answer)[0]['translation_text']
                clean_answer = translation

        return clean_answer, chunk_ids

    question = st.text_input("Γράψε την ερώτησή σου:")
    if question:
        with st.spinner("Ανάκτηση και παραγωγή απάντησης..."):
            answer, chunk_ids = generate_answer(question)
            st.markdown("**Απάντηση:**")
            st.success(answer)
            st.info(f"Χρησιμοποιήθηκαν τα chunks με ID: {chunk_ids}")

    os.remove(temp_filename)

else:
    st.info("Περιμένω να ανεβάσεις ένα PDF.")