# Import relevant modules
import os
import re
import gradio as gr
import weaviate
from openai import OpenAI
from pypdf import PdfReader
from weaviate.auth import AuthApiKey
from weaviate.classes.config import Property, DataType
from dotenv import load_dotenv

# Setup
load_dotenv()  # read credentials from a local .env file, if present

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
WEAVIATE_URL = os.getenv("WEAVIATE_URL")
WEAVIATE_API_KEY = os.getenv("WEAVIATE_API_KEY")

print("Testing Weaviate connection...")
print("URL:", WEAVIATE_URL)
print("API KEY:", "SET" if WEAVIATE_API_KEY else "MISSING")
print("OPENAI_API_KEY:", "SET" if OPENAI_API_KEY else "MISSING")

# Connect to Weaviate Cloud
client = weaviate.connect_to_weaviate_cloud(
    cluster_url=WEAVIATE_URL,
    auth_credentials=AuthApiKey(WEAVIATE_API_KEY),
    skip_init_checks=True
)

openai_client = OpenAI(api_key=OPENAI_API_KEY)
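
# Optional sanity check, a minimal sketch: the v4 client exposes a readiness
# probe, which can be useful since skip_init_checks=True suppresses the usual
# startup validation. Kept commented out so the app's behaviour is unchanged.
# print("Weaviate ready:", client.is_ready())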

# Load and process PDF
def extract_text_from_pdf(pdf_path):
    if not pdf_path or not os.path.exists(pdf_path):
        raise ValueError("No PDF file provided or file does not exist")
    reader = PdfReader(pdf_path)
    text = ""
    for page in reader.pages:
        # extract_text() can return None for image-only pages
        text += (page.extract_text() or "") + "\n"
    return text

# Chunk the text
def chunk_text(text, chunk_size=1000, overlap=200):
    chunks = []
    start = 0
    while start < len(text):
        end = start + chunk_size
        chunks.append(text[start:end])
        start += chunk_size - overlap
    return chunks
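
# With the defaults, each chunk starts chunk_size - overlap = 800 characters
# after the previous one, so consecutive chunks share 200 characters of
# context. For example, chunk_text("x" * 2000) yields three chunks starting
# at offsets 0, 800 and 1600.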

# Weaviate setup
def setup_schema():
    # wipe old collections
    client.collections.delete_all()
    # create new collection; vectorizer_config=None because we supply our own vectors
    client.collections.create(
        name="PDFChunk",
        vectorizer_config=None,
        properties=[
            Property(name="text", data_type=DataType.TEXT),
            Property(name="page", data_type=DataType.INT)
        ]
    )

# Create embeddings and store in vector DB
def embed(text):
    return openai_client.embeddings.create(
        model="text-embedding-3-large",
        input=text
    ).data[0].embedding
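
# Optional sketch, not wired into the app: the OpenAI embeddings endpoint
# also accepts a list of strings, so chunks can be embedded in batches
# instead of one request per chunk. The batch size of 100 is an assumption
# chosen conservatively, not a documented limit.
def embed_batch(texts, batch_size=100):
    vectors = []
    for i in range(0, len(texts), batch_size):
        response = openai_client.embeddings.create(
            model="text-embedding-3-large",
            input=texts[i:i + batch_size]
        )
        # response.data preserves the order of the inputs
        vectors.extend(item.embedding for item in response.data)
    return vectors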

def insert_chunks(chunks):
    pdf_chunks = client.collections.get("PDFChunk")
    for i, chunk in enumerate(chunks):
        vec = embed(chunk)
        pdf_chunks.data.insert(
            properties={"text": chunk, "page": i},
            vector=vec
        )
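
# Optional sketch, not used by the app: the v4 client's dynamic batching
# context manager amortises round trips when inserting many objects.
# It relies on the embed_batch() sketch above.
def insert_chunks_batched(chunks):
    pdf_chunks = client.collections.get("PDFChunk")
    vectors = embed_batch(chunks)
    with pdf_chunks.batch.dynamic() as batch:
        for i, (chunk, vec) in enumerate(zip(chunks, vectors)):
            batch.add_object(properties={"text": chunk, "page": i}, vector=vec)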

# Querying
def expand_query(query):
    try:
        prompt = f"""Expand the following short question into a more detailed search query
that includes synonyms and related HR terms, but also restate the keywords clearly.
Examples:
Q: Who should I contact if I am sick?
Expanded: Who should I notify or contact if I am ill, unwell, or absent due to sickness, such as my Deputy Head or line manager?
Q: What do I do if I am late?
Expanded: What procedure should I follow if I expect to be late, delayed, or absent for work, and who must I contact, for example my Deputy Head or line manager?
Now expand this query in the same way:
Q: {query}
Expanded:
"""
        response = openai_client.chat.completions.create(
            model="gpt-4.1-mini",
            messages=[{"role": "user", "content": prompt}],
            temperature=0
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        print("⚠️ Query expansion failed:", e)
        return query

def search_weaviate(query, k=12):
    pdf_chunks = client.collections.get("PDFChunk")
    expanded_query = expand_query(query)
    query_vec = embed(expanded_query)
    result = pdf_chunks.query.hybrid(  # both lexical and semantic
        query=expanded_query,
        vector=query_vec,
        alpha=0.3,
        limit=k,
        return_properties=["text", "page"]
    )
    filtered_objects = []
    for o in result.objects:
        distance = getattr(o.metadata, "distance", None)
        certainty = getattr(o.metadata, "certainty", None)
        # Keep results above a relevance threshold
        if (distance is None or distance < 1.2) or (certainty and certainty > 0.3):
            filtered_objects.append(o)
    return [(o.properties["text"], o.metadata.distance) for o in filtered_objects]
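
# Note: hybrid queries return a fused relevance score rather than a raw
# vector distance, so distance/certainty above are typically None and the
# filter is permissive. A sketch of filtering on the actual score instead,
# assuming the v4 query API's MetadataQuery (the 0.3 threshold is arbitrary):
#   from weaviate.classes.query import MetadataQuery
#   result = pdf_chunks.query.hybrid(..., return_metadata=MetadataQuery(score=True))
#   keep = [o for o in result.objects if o.metadata.score and o.metadata.score > 0.3]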

def rerank_chunks_with_llm(query, chunks):
    """
    Rerank retrieved chunks using GPT reasoning.
    Returns the chunk texts ordered by descending relevance.
    """
    # Build a short reranking prompt
    chunk_list_parts = []
    for i, (text, _) in enumerate(chunks):
        clean_text = text[:400].strip().replace("\n", " ")
        chunk_list_parts.append(f"[{i+1}] {clean_text}...")
    chunk_list = "\n\n".join(chunk_list_parts)
    rerank_prompt = f"""
You are a precise HR assistant that ranks excerpts
from a staff handbook by how relevant they are to the user's question.
You must rank excerpts that directly answer the user's question higher than those that merely discuss related topics.
Question: {query}
Excerpts:
{chunk_list}
Return only the list of excerpt numbers, separated by commas, in descending order of relevance.
Example: 3, 1, 2
"""
    # Run the LLM reranker
    response = openai_client.chat.completions.create(
        model="gpt-4.1-mini",
        messages=[
            {"role": "system", "content": "You are a factual and consistent reranker."},
            {"role": "user", "content": rerank_prompt}
        ],
        temperature=0
    )
    text_output = response.choices[0].message.content.strip()
    print(f"Reranker raw output: {text_output}")  # optional debug output
    # Extract numbers safely
    order = [int(x) for x in re.findall(r'\d+', text_output)]
    order = [i for i in order if 1 <= i <= len(chunks)]  # ensure valid range
    # Fallback: if the model fails to output indices, keep the original order
    if not order:
        order = list(range(1, len(chunks) + 1))
    # Return the reordered chunk texts
    ordered_chunks = [chunks[i - 1][0] for i in order]
    return ordered_chunks

def ask_question(query):
    chunks = search_weaviate(query, k=12)
    reranked_chunks = rerank_chunks_with_llm(query, chunks)
    # Use the top four chunks after reranking
    context = "\n\n---\n\n".join(reranked_chunks[:4])
    prompt = f"""
You are an HR assistant answering questions from the staff handbook.
Use only the following content to answer accurately and concisely:
{context}
Question: {query}
Answer:
"""
    response = openai_client.chat.completions.create(
        model="gpt-4.1-mini",
        messages=[
            {"role": "system", "content":
                "You are a helpful HR assistant. Base your answer only on the handbook excerpts provided. "
                "If the information is unclear, infer carefully using HR policies but prefer quoting exact text."},
            {"role": "user", "content": prompt}
        ],
        temperature=0
    )
    return response.choices[0].message.content.strip()
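
# Quick smoke test once a PDF has been indexed (kept commented so the module
# can be imported without side effects):
# print(ask_question("Who should I contact if I am sick?"))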

# Gradio App
def process_pdf(pdf_file):
    try:
        if not pdf_file:
            return "❌ No file uploaded"
        setup_schema()
        # pdf_file is already a string path because of type="filepath"
        text = extract_text_from_pdf(pdf_file)
        chunks = chunk_text(text)
        insert_chunks(chunks)
        return "✅ PDF uploaded and indexed! You can now ask questions."
    except Exception as e:
        import traceback
        return f"❌ Error: {e}\n{traceback.format_exc()}"

def qa_pipeline(question):
    return ask_question(question)

with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # Global CSS injected explicitly
    gr.HTML("""
    <style>
    /* widen overall container */
    .gradio-container { max-width: 1100px !important; margin: auto; }
    /* make textareas bigger & full width */
    #qbox textarea { width: 100% !important; min-height: 110px !important; font-size: 16px !important; }
    #abox textarea { width: 100% !important; min-height: 220px !important; font-size: 16px !important; }
    </style>
    """)
    gr.Markdown("## PDF Q&A Bot with Weaviate + OpenAI")
    with gr.Tab("Upload PDF"):
        pdf_input = gr.File(label="Upload PDF", type="filepath")
        upload_btn = gr.Button("Process PDF")
        status = gr.Textbox(label="Status")
        upload_btn.click(process_pdf, inputs=pdf_input, outputs=status)
    with gr.Tab("Ask Questions"):
        question = gr.Textbox(
            label="Your Question",
            elem_id="qbox"  # ID targeted by the CSS above
        )
        answer = gr.Textbox(
            label="Answer",
            elem_id="abox"  # ID targeted by the CSS above
        )
        ask_btn = gr.Button("Ask", size="lg")
        ask_btn.click(qa_pipeline, inputs=question, outputs=answer)

demo.launch()
client.close()  # runs once the Gradio server shuts down