HarisVasilo committed on
Commit
21e7d03
·
verified ·
1 Parent(s): 899c7f6

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +209 -0
app.py ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from langchain_community.embeddings import HuggingFaceEmbeddings
3
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
4
+ from langchain_community.vectorstores import Chroma
5
+ from langchain_community.document_loaders import PyPDFLoader
6
+ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
7
+ import torch
8
+ import uuid
9
+ import os
10
+ import langid
11
+
12
# Page title rendered at the top of the main area.
st.title("RAG PDF Q&A με DeepSeek-7B και Paraphrasing (μόνο Αγγλικά)")

# Sidebar section: a header plus the PDF upload widget. The widget's return
# value is bound to `pdf` and drives the `if pdf:` branch that follows.
with st.sidebar:
    st.header("Ανέβασε PDF")
    pdf = st.file_uploader("Επίλεξε PDF", type="pdf")
16
+
17
+ if pdf:
18
# Write the upload to a uniquely named temp file so it can be handed to
# PyPDFLoader as a filesystem path. The file is kept on disk because later
# code still refers to `temp_filename` (including the final cleanup).
temp_filename = f"temp_{uuid.uuid4()}.pdf"
with open(temp_filename, "wb") as f:
    # getvalue() returns the complete buffer regardless of the current read
    # position; read() would return b"" if the upload object had already been
    # consumed, silently producing an empty PDF.
    f.write(pdf.getvalue())

loader = PyPDFLoader(temp_filename)
try:
    pages = loader.load()
except Exception:
    # Parsing failed: remove the temp file before surfacing the error, since
    # the cleanup at the end of the script will never be reached.
    if os.path.exists(temp_filename):
        os.remove(temp_filename)
    raise

st.sidebar.success(f"Το έγγραφο έχει {len(pages)} σελίδες.")
26
+
27
# ---- Load DeepSeek ----
@st.cache_resource
def load_llm():
    """Load the DeepSeek 7B chat checkpoint and its tokenizer.

    The model is loaded with a 4-bit NF4 bitsandbytes configuration (double
    quantization enabled, float16 compute dtype) and ``device_map="auto"``.
    Both ``from_pretrained`` calls pass ``trust_remote_code=True``.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    checkpoint = "deepseek-ai/deepseek-llm-7b-chat"

    quantization = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.float16,
    )

    llm_tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
    llm = AutoModelForCausalLM.from_pretrained(
        checkpoint,
        device_map="auto",
        quantization_config=quantization,
        trust_remote_code=True,
    )
    return llm_tokenizer, llm


# Module-level handles used by the answer-generation code further down.
tokenizer, model = load_llm()
52
+
53
# ---- Paraphrasing for English ----
@st.cache_resource
def load_en_paraphraser():
    """Build the text2text-generation pipeline used to paraphrase questions.

    Returns:
        The ``transformers`` pipeline backed by ``ramsrigouthamg/t5_paraphraser``.
    """
    paraphrase_pipeline = pipeline(
        task="text2text-generation",
        model="ramsrigouthamg/t5_paraphraser",
    )
    return paraphrase_pipeline


# Module-level paraphraser instance.
paraphraser_en = load_en_paraphraser()
59
+
60
@st.cache_resource
def load_translator():
    """Build the English-to-Greek translation pipeline.

    Wrapped in ``st.cache_resource`` like the other model loaders in this
    script; without it the pipeline would be constructed again each time the
    script body runs.

    Returns:
        The ``transformers`` translation pipeline for ``Helsinki-NLP/opus-mt-en-el``.
    """
    return pipeline("translation", model="Helsinki-NLP/opus-mt-en-el")


translation_pipeline = load_translator()
61
+
62
# Whitespace-delimited word count across all loaded pages.
total_words = sum(len(page.page_content.split()) for page in pages)
# Guard against a PDF that produced no pages (would otherwise divide by zero).
avg_words_per_page = total_words / len(pages) if pages else 0

st.sidebar.info(f"Μέσος όρος λέξεων ανά σελίδα: {int(avg_words_per_page)}")
66
+
67
# Default chunking parameters offered in the sidebar (sizes are in characters,
# the unit RecursiveCharacterTextSplitter measures by default).
proposed_chunk_size = 1000
proposed_overlap = 300

# Lower bounds keep the user from entering values the splitter cannot accept.
user_chunk_size = st.sidebar.number_input(
    "Επίλεξε Chunk size", min_value=50, value=proposed_chunk_size, step=50
)
user_overlap = st.sidebar.number_input(
    "Επίλεξε Overlap", min_value=0, value=proposed_overlap, step=50
)

# The splitter raises when the overlap exceeds the chunk size, and an overlap
# equal to the chunk size makes no forward progress; clamp instead of crashing.
if user_overlap >= user_chunk_size:
    user_overlap = user_chunk_size // 2
    st.sidebar.warning(
        f"Το overlap πρέπει να είναι μικρότερο από το chunk size· χρησιμοποιείται {user_overlap}."
    )

text_splitter = RecursiveCharacterTextSplitter(
    # The final "" separator is the character-level fallback: without it a
    # run of text containing no spaces could exceed chunk_size. (The original
    # list ended with a duplicated " ", which had no effect.)
    separators=["\n\n", "\n", ". ", "! ", "; ", "? ", " ", ""],
    chunk_size=user_chunk_size,
    chunk_overlap=user_overlap,
)

docs = text_splitter.split_documents(pages)

# ---- Attach custom ids ----
# Sequential index per chunk, reported back to the user alongside answers.
for idx, doc in enumerate(docs):
    doc.metadata["custom_id"] = idx

st.success(f"Επεξεργάστηκαν {len(docs)} chunks.")
86
+
87
@st.cache_resource
def load_embeddings():
    """Build the sentence-transformers embedding function used for indexing.

    Cached with ``st.cache_resource`` like the other model loaders so the
    embedding model is not constructed again on each run of the script body.
    """
    return HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )


embedding_function = load_embeddings()

# ---- New PDF uploaded (or chunking changed)? Clear the vector store ----
# The key is built from the upload's name/size and the chunking settings.
# Keying on `temp_filename` cannot work: it embeds a fresh uuid on every run,
# so the comparison would always differ and the index would be rebuilt on
# every interaction.
index_key = (pdf.name, pdf.size, user_chunk_size, user_overlap)
if st.session_state.get("loaded_filename") != index_key:
    stale_vectordb = st.session_state.get("vectordb")
    if stale_vectordb is not None:
        # Drop the superseded collection so replaced indexes do not pile up.
        stale_vectordb.delete_collection()
    st.session_state.vectordb = None
    st.session_state.retriever = None
    st.session_state.docs = None
    st.session_state.loaded_filename = index_key

if st.session_state.get("vectordb") is None:
    # Build a fresh index under a unique collection name.
    vectordb = Chroma(
        collection_name=f"rag_pdf_collection_{uuid.uuid4()}",
        embedding_function=embedding_function,
    )
    if docs:
        # Skip the insert when splitting produced no chunks.
        vectordb.add_documents(docs)
    retriever = vectordb.as_retriever(search_kwargs={"k": 6})
    st.session_state.vectordb = vectordb
    st.session_state.retriever = retriever
    st.session_state.docs = docs
else:
    # Same document and settings as the previous run: reuse the stored index.
    retriever = st.session_state.retriever
    docs = st.session_state.docs

st.sidebar.success("Έτοιμο το retriever.")
115
+
116
# ---- Rephrasing ----
def rephrase_question(original_question, lang, paraphraser=None, num_paraphrases=3):
    """Return the question followed by distinct paraphrases of it.

    Args:
        original_question: The user's question text.
        lang: Detected language code. ``"el"`` (Greek) disables paraphrasing
            because no Greek paraphrasing model is wired in.
        paraphraser: Optional callable with the text2text pipeline call shape
            (``paraphraser(prompt, **kwargs)`` returning a list of dicts with a
            ``"generated_text"`` key). Defaults to the module-level
            ``paraphraser_en``.
        num_paraphrases: Number of sequences requested from the paraphraser.

    Returns:
        A list whose first element is always ``original_question``. Paraphrases
        follow in the order the paraphraser produced them, with empty strings
        and case-insensitive duplicates (including of the original) removed.
        If paraphrasing fails, a sidebar warning is shown and only the original
        question is returned.
    """
    import logging

    variations = [original_question]
    if lang == "el":
        # No paraphrasing available for Greek at the moment.
        return variations

    try:
        if paraphraser is None:
            paraphraser = paraphraser_en
        outputs = paraphraser(
            f"paraphrase: {original_question} </s>",
            max_length=64,
            num_return_sequences=num_paraphrases,
            do_sample=True,
        )
        candidates = [item["generated_text"].strip() for item in outputs]
    except Exception:
        # Best effort: keep answering with the original question, but record
        # the failure instead of discarding it.
        logging.getLogger(__name__).exception("Question paraphrasing failed")
        st.sidebar.warning("Το paraphrasing απέτυχε. Χρησιμοποιούμε μόνο την αρχική ερώτηση.")
        return variations

    # Ordered de-duplication: a set would make the order nondeterministic and
    # a paraphrase equal to the original would trigger a redundant retrieval.
    seen = {original_question.strip().casefold()}
    for candidate in candidates:
        key = candidate.casefold()
        if candidate and key not in seen:
            seen.add(key)
            variations.append(candidate)

    return variations
133
+
134
def generate_answer(question):
    """Answer ``question`` from the indexed PDF chunks using the loaded LLM.

    Relies on module-level objects defined earlier in the script:
    ``rephrase_question``, ``retriever``, ``tokenizer``, ``model`` and
    ``translation_pipeline``.

    Steps:
        1. Detect the question language with ``langid``.
        2. Expand the question into variations and retrieve chunks for each.
        3. De-duplicate chunks by text and build the prompt.
        4. Generate, keep only the newly generated text, and clean it.
        5. For Greek questions, translate the answer when more than 40% of its
           characters are ASCII letters.

    Args:
        question: The user's question text.

    Returns:
        A ``(answer, chunk_ids)`` tuple: the answer string (or the
        language-specific fallback) and the ``custom_id`` values of the chunks
        that were placed in the prompt.
    """
    detected_lang = langid.classify(question)[0]
    if detected_lang == "el":
        lang_instruction = "Απάντησε στα Ελληνικά."
        fallback_response = "Δεν γνωρίζω."
    else:
        lang_instruction = "Answer in English."
        fallback_response = "I do not know."

    variations = rephrase_question(question, detected_lang)

    st.sidebar.info(f"Εναλλακτικές ερωτήσεις: {variations}")

    # Retrieve for every variation; keying by chunk text removes duplicates
    # while keeping first-seen order.
    docs_by_text = {}
    for variation in variations:
        for doc in retriever.get_relevant_documents(variation):
            docs_by_text[doc.page_content] = doc
    unique_docs = list(docs_by_text.values())

    chunk_ids = [
        doc.metadata["custom_id"]
        for doc in unique_docs
        if "custom_id" in doc.metadata
    ]

    if not unique_docs:
        # Nothing was retrieved, so there is no context to answer from; skip
        # the LLM call and return the fallback directly.
        return fallback_response, chunk_ids

    context = "\n\n".join(doc.page_content for doc in unique_docs)

    prompt = (
        f"\n{lang_instruction}\n"
        "Χρησιμοποίησε ΜΟΝΟ τα συμφραζόμενα.\n"
        f"Αν δεν υπάρχει απάντηση στα συμφραζόμενα, πες ρητά: {fallback_response}\n"
        "Συμφραζόμενα:\n"
        f"{context}\n"
        f"Ερώτηση: {question}\n"
        "Απάντηση:"
    )

    # Place the inputs on the model's device instead of assuming "cuda".
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    output = model.generate(
        **inputs,
        max_new_tokens=250,
        # temperature only takes effect when sampling is enabled.
        do_sample=True,
        temperature=0.2,
        repetition_penalty=1.2,
        eos_token_id=tokenizer.eos_token_id,
    )

    # Decode only the tokens produced after the prompt. Decoding the whole
    # sequence and splitting on the answer label breaks whenever the decoded
    # prompt does not reproduce that label exactly.
    prompt_length = inputs["input_ids"].shape[1]
    generated_text = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)

    # If the model repeats the answer label, keep what follows the last one.
    if "Απάντηση:" in generated_text:
        clean_answer = generated_text.split("Απάντηση:")[-1].strip()
    else:
        clean_answer = generated_text.strip()

    # An empty answer, or one echoing prompt instructions or the question
    # itself, is treated as "no answer".
    lowered_answer = clean_answer.lower()
    echo_markers = ["απάντησε", "συμφραζόμενα", question.lower()]
    if clean_answer == "" or any(marker in lowered_answer for marker in echo_markers):
        clean_answer = fallback_response

    if detected_lang == "el" and clean_answer != fallback_response:
        # A Greek question answered mostly in Latin letters is taken to be an
        # English answer and is translated to Greek.
        ascii_letters = sum(c.isascii() and c.isalpha() for c in clean_answer)
        if ascii_letters > len(clean_answer) * 0.4:
            clean_answer = translation_pipeline(clean_answer)[0]['translation_text']

    return clean_answer, chunk_ids
197
+
198
question = st.text_input("Γράψε την ερώτησή σου:")
try:
    if question:
        with st.spinner("Ανάκτηση και παραγωγή απάντησης..."):
            answer, chunk_ids = generate_answer(question)
        st.markdown("**Απάντηση:**")
        st.success(answer)
        st.info(f"Χρησιμοποιήθηκαν τα chunks με ID: {chunk_ids}")
finally:
    # Always remove the temp PDF written for this run, even if answering
    # raised; tolerate the file already being gone.
    try:
        os.remove(temp_filename)
    except FileNotFoundError:
        pass
207
+
208
+ else:
209
+ st.info("Περιμένω να ανεβάσεις ένα PDF.")