ChatBot / app.py
HarisVasilo's picture
Update app.py
977750d verified
import streamlit as st
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import PyPDFLoader
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
import torch
import uuid
import os
import langid
st.title("RAG PDF Q&A με DeepSeek-7B και Paraphrasing (μόνο Αγγλικά)")
st.sidebar.header("Ανέβασε PDF")
pdf = st.sidebar.file_uploader("Επίλεξε PDF", type="pdf")
if pdf:
temp_filename = f"temp_{uuid.uuid4()}.pdf"
with open(temp_filename, "wb") as f:
f.write(pdf.read())
loader = PyPDFLoader(temp_filename)
pages = loader.load()
st.sidebar.success(f"Το έγγραφο έχει {len(pages)} σελίδες.")
# ---- Φόρτωση DeepSeek ----
@st.cache_resource
def load_llm():
MODEL_ID = "deepseek-ai/deepseek-llm-7b-chat"
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_use_double_quant=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.float16,
)
tokenizer = AutoTokenizer.from_pretrained(
MODEL_ID,
trust_remote_code=True
)
model = AutoModelForCausalLM.from_pretrained(
MODEL_ID,
quantization_config=bnb_config,
device_map="auto",
trust_remote_code=True
)
return tokenizer, model
tokenizer, model = load_llm()
# ---- Paraphrasing για Αγγλικά ----
@st.cache_resource
def load_en_paraphraser():
return pipeline("text2text-generation", model="ramsrigouthamg/t5_paraphraser")
paraphraser_en = load_en_paraphraser()
translation_pipeline = pipeline("translation", model="Helsinki-NLP/opus-mt-en-el")
total_words = sum(len(page.page_content.split()) for page in pages)
avg_words_per_page = total_words / len(pages)
st.sidebar.info(f"Μέσος όρος λέξεων ανά σελίδα: {int(avg_words_per_page)}")
proposed_chunk_size = 1000
proposed_overlap = 300
user_chunk_size = st.sidebar.number_input("Επίλεξε Chunk size", value=proposed_chunk_size, step=50)
user_overlap = st.sidebar.number_input("Επίλεξε Overlap", value=proposed_overlap, step=50)
text_splitter = RecursiveCharacterTextSplitter(
separators=["\n\n", "\n", ". ", "! ", "; ", "? ", " ", " "],
chunk_size=user_chunk_size,
chunk_overlap=user_overlap
)
docs = text_splitter.split_documents(pages)
# ---- Προσθήκη custom ids ----
for idx, doc in enumerate(docs):
doc.metadata["custom_id"] = idx
st.success(f"Επεξεργάστηκαν {len(docs)} chunks.")
embedding_function = HuggingFaceEmbeddings(
model_name="sentence-transformers/all-MiniLM-L6-v2"
)
# ---- Ανέβηκε νέο PDF; Καθάρισε vectordb ----
if st.session_state.get("loaded_filename") != temp_filename:
st.session_state.vectordb = None
st.session_state.retriever = None
st.session_state.docs = None
st.session_state.loaded_filename = temp_filename
if not st.session_state.get("vectordb"):
vectordb = Chroma(
collection_name=f"rag_pdf_collection_{uuid.uuid4()}",
embedding_function=embedding_function
)
vectordb.add_documents(docs)
retriever = vectordb.as_retriever(
search_kwargs={"k": 6}
)
st.session_state.vectordb = vectordb
st.session_state.retriever = retriever
st.session_state.docs = docs
else:
retriever = st.session_state.retriever
docs = st.session_state.docs
st.sidebar.success("Έτοιμο το retriever.")
# ---- Rephrasing ----
def rephrase_question(original_question, lang):
variations = [original_question]
paraphrase_prompt = f"paraphrase: {original_question} </s>"
try:
if lang == "el":
# Δεν υπάρχει διαθέσιμο paraphrasing για ελληνικά προς το παρόν
rephrases = []
else:
output = paraphraser_en(paraphrase_prompt, max_length=64, num_return_sequences=3, do_sample=True)
rephrases = list({o['generated_text'].strip() for o in output})
variations.extend(rephrases)
except Exception as e:
st.sidebar.warning("Το paraphrasing απέτυχε. Χρησιμοποιούμε μόνο την αρχική ερώτηση.")
return variations
def generate_answer(question):
detected_lang = langid.classify(question)[0]
if detected_lang == "el":
lang_instruction = "Απάντησε στα Ελληνικά."
fallback_response = "Δεν γνωρίζω."
else:
lang_instruction = "Answer in English."
fallback_response = "I do not know."
variations = rephrase_question(question, detected_lang)
st.sidebar.info(f"Εναλλακτικές ερωτήσεις: {variations}")
all_docs = []
for var in variations:
docs_found = retriever.get_relevant_documents(var)
all_docs.extend(docs_found)
unique_docs = list({doc.page_content: doc for doc in all_docs}.values())
context = "\n\n".join([doc.page_content for doc in unique_docs])
chunk_ids = []
for doc in unique_docs:
if "custom_id" in doc.metadata:
chunk_ids.append(doc.metadata["custom_id"])
prompt = f"""
{lang_instruction}
Χρησιμοποίησε ΜΟΝΟ τα συμφραζόμενα.
Αν δεν υπάρχει απάντηση στα συμφραζόμενα, πες ρητά: {fallback_response}
Συμφραζόμενα:
{context}
Ερώτηση: {question}
Απάντηση:"""
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
output = model.generate(
**inputs,
max_new_tokens=500,
temperature=0.3,
repetition_penalty=1.2,
eos_token_id=tokenizer.eos_token_id,
)
full_answer = tokenizer.decode(output[0], skip_special_tokens=True)
if "Απάντηση:" in full_answer:
clean_answer = full_answer.split("Απάντηση:")[-1].strip()
else:
clean_answer = full_answer.strip()
if clean_answer == "" or any(
bad in clean_answer.lower()
for bad in ["απάντησε", "συμφραζόμενα", question.lower()]
):
clean_answer = fallback_response
if detected_lang == "el" and clean_answer != fallback_response:
if sum(c.isalpha() and c in 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ' for c in clean_answer) > len(clean_answer) * 0.4:
translation = translation_pipeline(clean_answer)[0]['translation_text']
clean_answer = translation
return clean_answer, chunk_ids
question = st.text_input("Γράψε την ερώτησή σου:")
if question:
with st.spinner("Ανάκτηση και παραγωγή απάντησης..."):
answer, chunk_ids = generate_answer(question)
st.markdown("**Απάντηση:**")
st.success(answer)
st.info(f"Χρησιμοποιήθηκαν τα chunks με ID: {chunk_ids}")
os.remove(temp_filename)
else:
st.info("Περιμένω να ανεβάσεις ένα PDF.")