import os
from cerebras.cloud.sdk import Cerebras
from PyPDF2 import PdfReader
from docx import Document
import gradio as gr
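# Assumed dependencies (PyPI package names inferred from the imports above):
#   pip install cerebras_cloud_sdk PyPDF2 python-docx gradio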
Cerekey = os.getenv("LitReview")
# Initialize the Cerebras AI client with the API key
client = Cerebras(api_key=Cerekey)
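# Note: on Hugging Face Spaces, "LitReview" would be set as a repository secret
# holding the Cerebras API key; when running locally, export it as an
# environment variable before launching the app.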
def extract_text_from_file(file):
    """Extracts text from uploaded PDF or DOCX files."""
    if file.name.endswith(".pdf"):
        reader = PdfReader(file.name)
        text = ""
        for page in reader.pages:
            # extract_text() may return None for pages with no text layer
            text += page.extract_text() or ""
        return text
    elif file.name.endswith(".docx"):
        doc = Document(file.name)
        return "\n".join(p.text for p in doc.paragraphs)
    else:
        return "Unsupported file format. Please upload a PDF or DOCX file."
def chunk_text(text, max_chars=4000):
    """
    Splits text into chunks small enough for the Llama model to process.
    Chunks are capped at roughly `max_chars` characters, a simple proxy for
    the model's token limit (an exact count would require a tokenizer).
    """
    words = text.split()
    chunks = []
    current_chunk = []
    for word in words:
        current_chunk.append(word)
        if len(" ".join(current_chunk)) > max_chars:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks
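# Illustrative example of the character-based splitting above:
#   chunk_text("alpha beta gamma delta epsilon", max_chars=20)
#   -> ["alpha beta gamma delta", "epsilon"]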
def analyze_chunk(chunk):
    """
    Analyzes a single chunk of text using the Cerebras Llama model.
    """
    messages = [
        {
            "role": "system",
            "content": (
                "You are an experienced scholar tasked with analyzing research articles. "
                "Focus on extracting insights based on: Author (APA format, with et al. if applicable), Year of publication, Title of the article; "
                "Problem addressed; Methodology (datasets, tools, techniques, algorithms); "
                "Results (specific, quantifiable metrics); and Remarks (strengths, weaknesses, improvements). "
                "Summarize only insights related to these fields and disregard irrelevant content."
            )
        },
        {
            "role": "user",
            "content": chunk
        }
    ]
    try:
        # Stream the completion from the Cerebras API
        stream = client.chat.completions.create(
            messages=messages,
            model="llama-3.3-70b",
            stream=True,
            max_completion_tokens=1024,
            temperature=0.2,
            top_p=1
        )
        result = ""
        # Use `event` for the stream items so the loop does not shadow
        # the `chunk` parameter
        for event in stream:
            result += event.choices[0].delta.content or ""
        return result
    except Exception as e:
        return f"An error occurred while processing a chunk: {e}"
def save_as_docx(content):
    """Generates and saves a DOCX file."""
    document = Document()
    document.add_heading("Literature Analysis", level=1)
    document.add_paragraph(content)
    file_path = "Literature_Analysis.docx"
    document.save(file_path)
    return file_path
def analyze_document(file):
    """Processes and analyzes the uploaded document."""
    text = extract_text_from_file(file)
    if text.startswith("Unsupported file format"):
        yield f"**Error:** {text}"
        return
    chunks = chunk_text(text)
    all_insights = []
    yield "**Processing the document. Please wait...**\n"
    for i, chunk in enumerate(chunks, 1):
        yield f"**Processing chunk {i} of {len(chunks)}...**"
        result = analyze_chunk(chunk)
        if result.strip():  # Only append non-empty results
            all_insights.append(result)
    if not all_insights:
        yield "**Error:** No valid insights were extracted from the document."
        return
    yield "**Consolidating all insights into a final summary...**"
    consolidated_summary_prompt = (
        "Below are insights extracted from multiple chunks of a document. "
        "Consolidate these insights into a single output organized as follows: "
        "Author, Year, Title; Problem addressed; Methodology; Results; and Remarks. "
        "Make the final output concise and coherent."
    )
    try:
        stream = client.chat.completions.create(
            messages=[
                {"role": "system", "content": consolidated_summary_prompt},
                {"role": "user", "content": "\n\n".join(all_insights)}
            ],
            model="llama-3.3-70b",
            stream=True,
            max_completion_tokens=1024,
            temperature=0.2,
            top_p=1
        )
        final_summary = ""
        for event in stream:
            final_summary += event.choices[0].delta.content or ""
        # Save the consolidated summary as a DOCX file next to the app;
        # the Markdown output reports the path, since the interface below
        # exposes only a single text component.
        docx_file = save_as_docx(final_summary)
        yield f"**Final Summary:**\n\n{final_summary}\n\n*Saved as `{docx_file}`.*"
    except Exception as e:
        yield f"**Error:** An error occurred during consolidation: {e}"
# Define the Gradio interface
interface = gr.Interface(
    fn=analyze_document,
    inputs=gr.File(label="Upload a PDF or DOCX file"),
    outputs=gr.Markdown(label="Progress and Analysis"),
    title="Automated Literature Review",
    description=(
        "Upload a PDF or DOCX document, and this tool will analyze it to extract and consolidate its content. "
        "Processing can take a while, especially for long documents, so smaller files are recommended."
    ),
)
# Launch the interface
if __name__ == "__main__":
    interface.launch()