# NOTE(review): the lines below appear to be HuggingFace Spaces page-header
# residue from scraping ("Spaces: Runtime error") — the Space's last recorded
# build status, not program source. Kept only as a comment so the file parses.
import os
import gradio as gr
from openai import OpenAI
import weaviate
from weaviate.classes.init import Auth
import pypdf  # Replaced PyPDF2
import docx
from langchain.text_splitter import RecursiveCharacterTextSplitter
from dotenv import load_dotenv
from prompt_template import (
    Prompt_template_translation,
    Prompt_template_LLM_Generation,
    Prompt_template_Reranker,
    Prompt_template_Wisal,
    Prompt_template_Halluciations,
    Prompt_template_paraphrasing,
    Prompt_template_Translate_to_original,
    Prompt_template_relevance,
)
from query_utils import process_query_for_rewrite, get_non_autism_response

# ─── Configuration ─────────────────────────────────────────────────────────────
# SECURITY: credentials were previously hard-coded in this file. Any key that
# ever appeared in source control must be considered leaked and rotated.
# They are now read from the environment; a local .env file is honored via
# python-dotenv (load_dotenv was imported but never called before).
load_dotenv()

GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
TAVILY_API_KEY = os.getenv("TAVILY_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")  # was assigned twice before
QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")
QDRANT_URL = os.getenv("QDRANT_URL")
WEAVIATE_URL = os.getenv("WEAVIATE_URL")
WEAVIATE_API_KEY = os.getenv("WEAVIATE_API_KEY")
DEEPINFRA_API_KEY = os.getenv("DEEPINFRA_API_KEY")
DEEPINFRA_BASE_URL = os.getenv("DEEPINFRA_BASE_URL", "https://api.deepinfra.com/v1/openai")

# DeepInfra exposes an OpenAI-compatible endpoint, so the OpenAI SDK is reused
# against it for both embeddings and chat completions.
openai = OpenAI(
    api_key=DEEPINFRA_API_KEY,
    base_url=DEEPINFRA_BASE_URL,
)

# Initialize Weaviate client (vector store). The gRPC init check is skipped
# because sandboxed hosts often block the gRPC port.
client = weaviate.connect_to_weaviate_cloud(
    cluster_url=WEAVIATE_URL,
    auth_credentials=Auth.api_key(WEAVIATE_API_KEY),
    skip_init_checks=True,  # <-- This disables gRPC check
)
# ─── Utility: Extract raw text ──────────────────────────────────────────────────
def extract_text(file_path: str) -> str:
    """Return the plain text content of a PDF, DOCX, or TXT file.

    Args:
        file_path: Path to the document; the extension selects the parser.

    Returns:
        The extracted text. For PDFs, each page's text is followed by a
        newline (pages with no extractable text contribute just the newline).

    Raises:
        ValueError: If the extension is not .pdf, .docx, or .txt.
    """
    suffix = os.path.splitext(file_path)[1].lower()
    if suffix == ".pdf":
        with open(file_path, "rb") as handle:
            pages = pypdf.PdfReader(handle).pages
            return "".join((page.extract_text() or "") + "\n" for page in pages)
    if suffix == ".docx":
        paragraphs = docx.Document(file_path).paragraphs
        return "\n".join(paragraph.text for paragraph in paragraphs)
    if suffix == ".txt":
        with open(file_path, "r", encoding="utf-8") as handle:
            return handle.read()
    raise ValueError("Unsupported file format. Use PDF, DOCX, or TXT.")
# ─── Chunker & Embed ──────────────────────────────────────────────────────────
# Shared text splitter: ~1000-character chunks with 200 characters of overlap,
# preferring paragraph breaks, then line breaks, then word boundaries.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    separators=["\n\n", "\n", " "],
)
def embed_texts(texts: list[str], batch_size: int = 50) -> list[list[float]]:
    """Embed texts in batches to avoid API limits.

    Sends at most ``batch_size`` strings per request to the DeepInfra
    embedding endpoint and concatenates the resulting vectors in order.
    """
    embeddings: list[list[float]] = []
    for start in range(0, len(texts), batch_size):
        response = openai.embeddings.create(
            model="Qwen/Qwen3-Embedding-8B",
            input=texts[start:start + batch_size],
            encoding_format="float",
        )
        embeddings += [item.embedding for item in response.data]
    return embeddings
# ─── Ingest & Index ───────────────────────────────────────────────────────────
def ingest_file(file_path: str) -> str:
    """Extract, chunk, embed, and index one document into the "Books" collection.

    Args:
        file_path: Path to a PDF, DOCX, or TXT file.

    Returns:
        A human-readable status line for the UI.
    """
    raw = extract_text(file_path)
    # split_text already returns list[str]; the previous identity copy
    # ([chunk for chunk in docs]) and the unused collections.get("Books")
    # lookup were removed.
    texts = splitter.split_text(raw)
    vectors = embed_texts(texts)
    # Batch insert with the Weaviate v4 client API.
    with client.batch.dynamic() as batch:
        for txt, vec in zip(texts, vectors):
            batch.add_object(
                collection="Books",
                properties={"text": txt},
                vector=vec,
            )
    return f"Ingested {len(texts)} chunks from {os.path.basename(file_path)}"
# ─── Query & Answer ───────────────────────────────────────────────────────────
def answer_question(question: str) -> str:
    """Answer an autism-related question via retrieval-augmented generation.

    Pipeline: rewrite/relevance-check the query, retrieve the 5 nearest
    chunks from the "Books" collection, generate an answer with Qwen3-32B,
    then gate the answer on an autism-relevance score.

    Args:
        question: Raw user question.

    Returns:
        The generated answer, or a canned refusal when either the question
        or the generated answer is not sufficiently autism-related.
    """
    # Local import kept (avoids widening the module-level query_utils import),
    # but hoisted from mid-function so it no longer runs after the LLM call.
    from query_utils import check_answer_autism_relevance, get_non_autism_answer_response

    # rewritten_query is currently unused; kept to document the tuple shape.
    corrected_query, is_autism_related, rewritten_query = process_query_for_rewrite(question)
    # If not autism-related, show the direct rejection message.
    if not is_autism_related:
        return get_non_autism_response()

    # Use the corrected query for retrieval.
    q_vec = embed_texts([corrected_query])[0]
    documents = client.collections.get("Books")
    response = documents.query.near_vector(
        near_vector=q_vec,
        limit=5,
        return_metadata=["distance"],
    )
    # (leftover debug print(context) removed)
    context = "\n\n".join(hit.properties["text"] for hit in response.objects)

    wisal_prompt = Prompt_template_Wisal.format(new_query=corrected_query, document=context)
    chat = openai.chat.completions.create(
        model="Qwen/Qwen3-32B",
        messages=[{"role": "user", "content": wisal_prompt}],
        temperature=0,
        reasoning_effort="none",
    )
    initial_answer = chat.choices[0].message.content

    # Refuse answers scoring below 50% autism relevance (enhanced scoring).
    answer_relevance_score = check_answer_autism_relevance(initial_answer)
    if answer_relevance_score < 50:
        return get_non_autism_answer_response()
    return initial_answer
# ─── Gradio Interface ─────────────────────────────────────────────────────────
# Two-part UI: an ingest row (file upload -> ingest_file) and a Q&A row
# (question textbox -> answer_question).
with gr.Blocks(title="Document Q&A with Qwen & Weaviate") as demo:
    gr.Markdown("## Upload a PDF, DOCX, or TXT and then ask away!")
    with gr.Row():
        up = gr.File(label="Select document")
        btn = gr.Button("Ingest")
    out = gr.Textbox(label="Status", interactive=False)
    # gr.File passes a tempfile-like object; ingest by its .name path.
    btn.click(fn=lambda f: ingest_file(f.name), inputs=up, outputs=out)
    with gr.Row():
        q = gr.Textbox(placeholder="Your question...", lines=2)
        ask = gr.Button("Ask")
    ans = gr.Textbox(label="Answer", lines=6, interactive=False)
    ask.click(fn=answer_question, inputs=q, outputs=ans)

if __name__ == "__main__":
    demo.launch(debug=True)