import logging
import os

import fitz  # PyMuPDF
from transformers import pipeline, T5Tokenizer, T5ForConditionalGeneration

logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

# General-purpose text2text pipeline used for the first generation pass.
pipe = pipeline("text2text-generation", model="google-t5/t5-base", device="cpu")
# Kept from the original; note that T5 already defines its own pad token.
pipe.model.config.pad_token_id = pipe.tokenizer.eos_token_id

# Locally fine-tuned T5 checkpoint used to refine the general model's output.
fine_tuned_model_path = "./fine_tuned_model"
fine_tuned_model = T5ForConditionalGeneration.from_pretrained(fine_tuned_model_path)
fine_tuned_tokenizer = T5Tokenizer.from_pretrained(fine_tuned_model_path)
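# Note (assumption, not stated in the source): ./fine_tuned_model must contain
# a saved T5 checkpoint (config.json, model weights, and the SentencePiece
# tokenizer files), e.g. as produced by model.save_pretrained(...) and
# tokenizer.save_pretrained(...); otherwise from_pretrained raises at startup.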

def extract_text_from_pdf(pdf_path):
    """Extract plain text from every page of a PDF using PyMuPDF."""
    try:
        if not os.path.exists(pdf_path):
            raise FileNotFoundError(f"PDF file '{pdf_path}' does not exist.")
        # Extract the text from the PDF file
        document = fitz.open(pdf_path)
        text = ""
        for page_num in range(document.page_count):
            page = document.load_page(page_num)
            text += page.get_text("text")
        document.close()
        logger.info(f"Text extraction successful from {pdf_path}.")
        return text
    except FileNotFoundError as e:
        logger.error(f"Error: {e}")
        raise
    except Exception as e:
        logger.error(f"An error occurred while extracting text from PDF: {e}")
        raise

def split_text_into_chunks(text, chunk_size=1000):
    """Split text into chunks of at most chunk_size words."""
    words = text.split()
    chunks = []
    for i in range(0, len(words), chunk_size):
        chunks.append(' '.join(words[i:i + chunk_size]))
    return chunks

def batch_process_texts(texts, batch_size=2):
    """Run text chunks through the general pipeline in small batches.

    The original called an undefined some_processing_function; it is assumed
    here to be the general `pipe` defined above (an assumption, not confirmed
    by the source).
    """
    batched_results = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        try:
            combined_text = " ".join(batch)
            processed_text = pipe(combined_text, max_length=100, truncation=True)[0]['generated_text']
            batched_results.append(processed_text)
        except Exception as e:
            logger.error(f"Error processing batch {i // batch_size + 1}: {e}")
            continue
    return batched_results

def generate_lesson_from_chunks(chunks):
    """Generate text for each batch of chunks and join the results."""
    generated_texts = batch_process_texts(chunks)
    return ' '.join(generated_texts)

def generate_lesson_from_transcript(doc_text):
    """Generate a lesson from a short transcript and save it under /tmp."""
    try:
        logger.info("Generating lesson from transcript using the general model.")
        generated_text = pipe(doc_text, max_length=100, truncation=True)[0]['generated_text']
        output_path = "/tmp/generated_output.txt"
        with open(output_path, "w") as file:
            file.write(generated_text)
        logger.info(f"Lesson generation successful. Output saved at: {output_path}")
        return generated_text, output_path
    except Exception as e:
        logger.error(f"Error occurred during lesson generation: {e}")
        return "An error occurred", None

def refine_with_fine_tuned_model(general_output):
    """Refine the general model's output with the fine-tuned T5 checkpoint."""
    try:
        logger.info("Refining the output with the fine-tuned model.")
        prompt = "Refine the following text for teaching purposes: " + general_output
        inputs = fine_tuned_tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=512)
        output_ids = fine_tuned_model.generate(
            inputs["input_ids"],
            attention_mask=inputs["attention_mask"],  # avoid pad-token ambiguity warnings
            max_length=300,
            num_beams=4,  # early_stopping is only meaningful with beam search
            no_repeat_ngram_size=3,
            early_stopping=True
        )
        refined_text = fine_tuned_tokenizer.decode(output_ids[0], skip_special_tokens=True)
        return refined_text
    except Exception as e:
        logger.error(f"Error during refinement with fine-tuned model: {e}")
        return "An error occurred during refinement."