# NOTE(review): removed non-Python scrape residue (file-size line, git blame hashes, column ruler) that broke the module.
import os
import gradio as gr
from langchain.document_loaders import PyPDFLoader, YoutubeLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.chat_models import init_chat_model
# --- API KEY HANDLING ---
# Accept either secret name so the app works whether the Hugging Face
# secret was created as 'OPENAI_API_KEY' or as 'openai'.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    OPENAI_API_KEY = os.getenv("openai")
if not OPENAI_API_KEY:
    # Fail fast at startup: nothing downstream works without a key.
    raise ValueError("β OPENAI API Key not found. Please add it in Hugging Face secrets as 'OPENAI_API_KEY' or 'openai'.")
# --- PROCESSING FUNCTION ---
def process_inputs(pdf_file, youtube_url, txt_file, query):
    """Answer *query* over an uploaded PDF plus an optional transcript.

    Loads the PDF (required), then a YouTube transcript if a URL is given,
    falling back to an uploaded .txt transcript when the YouTube fetch fails.
    Chunks everything, builds a FAISS index with OpenAI embeddings, and runs
    RetrievalQA. Returns either the answer string or a user-facing error
    message (the UI renders both in the same textbox).
    """
    # Guard missing inputs up front. Previously a missing PDF raised
    # AttributeError on pdf_file.name and surfaced as a misleading
    # "Failed to load PDF" message; an empty query ran the whole
    # embed/retrieve pipeline for nothing.
    if pdf_file is None:
        return "β Failed to load PDF: no PDF file was uploaded."
    if not query or not query.strip():
        return "β No documents could be loaded. Please check your inputs." if False else "β Please enter a question."

    docs = []

    # Load PDF (required).
    try:
        pdf_loader = PyPDFLoader(pdf_file.name)
        docs.extend(pdf_loader.load())
    except Exception as e:
        return f"β Failed to load PDF: {e}"

    # Load YouTube transcript (optional, best-effort).
    yt_loaded = False
    if youtube_url:
        try:
            yt_loader = YoutubeLoader.from_youtube_url(youtube_url, add_video_info=False)
            docs.extend(yt_loader.load())
            yt_loaded = True
        except Exception as e:
            # Deliberately non-fatal: the .txt upload below is the fallback.
            print(f"β οΈ YouTube transcript not loaded: {e}")

    # Load text transcript file (fallback only when YouTube was not loaded).
    if not yt_loaded and txt_file is not None:
        try:
            txt_loader = TextLoader(txt_file.name)
            docs.extend(txt_loader.load())
        except Exception as e:
            return f"β Failed to load transcript file: {e}"

    if not docs:
        return "β No documents could be loaded. Please check your inputs."

    # Split into overlapping chunks so retrieval keeps local context.
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
    splits = splitter.split_documents(docs)

    # Embed the chunks and index them for similarity search.
    embedding = OpenAIEmbeddings(model="text-embedding-3-large", api_key=OPENAI_API_KEY)
    db = FAISS.from_documents(splits, embedding)

    # Answer the question with RetrievalQA over the index.
    llm = init_chat_model("gpt-4o-mini", model_provider="openai", api_key=OPENAI_API_KEY)
    qa = RetrievalQA.from_chain_type(llm, retriever=db.as_retriever())
    try:
        result = qa.invoke({"query": query})
        return result["result"]
    except Exception as e:
        return f"β Retrieval failed: {e}"
# --- GRADIO UI ---
# Three inputs (PDF required; YouTube URL and .txt transcript optional),
# one question box, one answer box, wired through process_inputs.
with gr.Blocks() as demo:
    gr.Markdown("## π Ask Questions from PDF + YouTube Transcript or .txt Upload")

    # Source-document inputs, side by side.
    with gr.Row():
        pdf_upload = gr.File(label="Upload PDF", file_types=[".pdf"])
        youtube_box = gr.Textbox(label="YouTube URL (Optional)", placeholder="https://www.youtube.com/watch?v=...")
        transcript_upload = gr.File(label="Upload Transcript .txt (Optional fallback)", file_types=[".txt"])

    question_box = gr.Textbox(label="Your Question", placeholder="e.g., What did the document say about X?")
    answer_box = gr.Textbox(label="Answer")
    ask_button = gr.Button("Get Answer")

    # Clicking the button runs the full load -> index -> retrieve pipeline.
    ask_button.click(
        fn=process_inputs,
        inputs=[pdf_upload, youtube_box, transcript_upload, question_box],
        outputs=answer_box,
    )

if __name__ == "__main__":
    demo.launch()