<?php
/**
 * AJAX Endpoint for Memorandum Processing
 * 
 * Processes uploaded memorandum files using the robust document processor
 * to ensure consistent text extraction on both teacher and student sides.
 */

// Buffer all output so that anything printed while loading the includes
// below cannot end up in front of the JSON response body.
ob_start();

// Only fatal and parse errors are reported, and nothing is displayed,
// so warnings/notices cannot break the JSON output.
error_reporting(E_ERROR | E_PARSE);
ini_set('display_errors', '0');

require_once '../../includes/session_start.php';
require_once '../../config/db_connect.php';
require_once '../../includes/robust_document_processor.php';

// Discard whatever the includes may have printed
ob_clean();

// Set JSON content type
header('Content-Type: application/json');

// Check authentication: the caller needs a session user id and the 'teacher' role.
// The role is read with ?? so a session without 'user_role' is simply rejected.
$hasSessionUser = isset($_SESSION['user_id']);
$isTeacher = $hasSessionUser && ($_SESSION['user_role'] ?? null) === 'teacher';

if (!$isTeacher) {
    // 401 = not logged in, 403 = logged in with the wrong role. The body keeps the
    // original 'error' message and adds 'success' => false so it has the same shape
    // as the other error responses of this endpoint.
    http_response_code($hasSessionUser ? 403 : 401);
    echo json_encode(['success' => false, 'error' => 'Unauthorized access']);
    exit;
}

try {
    // Read the JSON request body: file metadata plus pre-extracted text and/or base64 content
    $rawBody = file_get_contents('php://input');
    $input = json_decode($rawBody === false ? '' : $rawBody, true);

    error_log('AJAX Debug - Content-Type: ' . ($_SERVER['CONTENT_TYPE'] ?? 'not set'));

    // Validate before touching any keys: a scalar JSON body (e.g. "abc") decodes to a
    // non-array value, which must not reach the array handling below.
    if (!is_array($input) || !$input) {
        throw new Exception('Invalid JSON input');
    }

    // Debug: log what we received, replacing the bulky payload fields with their
    // length so file contents and extracted text never land in the error log.
    $loggableInput = $input;
    foreach (['content', 'extractedText', 'file_data'] as $bulkyKey) {
        if (isset($loggableInput[$bulkyKey]) && is_string($loggableInput[$bulkyKey])) {
            $loggableInput[$bulkyKey] = '[DATA_' . strlen($loggableInput[$bulkyKey]) . '_CHARS]';
        }
    }
    error_log('AJAX Debug - JSON input: ' . json_encode($loggableInput, JSON_PARTIAL_OUTPUT_ON_ERROR));

    // Pull the expected fields; anything of the wrong type is treated as absent
    $fileName = $input['name'] ?? '';
    $fileType = is_string($input['type'] ?? null) ? $input['type'] : '';
    $fileSize = is_scalar($input['size'] ?? null) ? $input['size'] : 0;
    $extractedText = is_string($input['extractedText'] ?? null) ? $input['extractedText'] : '';
    $fileContent = is_string($input['content'] ?? null) ? $input['content'] : ''; // Base64 content as fallback

    if (!is_string($fileName) || trim($fileName) === '') {
        throw new Exception('Missing file name');
    }

    // Strip control characters so a crafted name cannot inject extra log lines
    $logName = preg_replace('/[\x00-\x1F\x7F]/', '', $fileName);
    error_log("Processing memorandum: {$logName} ({$fileType}, {$fileSize} bytes)");

    $processedText = '';
    $method = 'unknown';

    // If we have pre-extracted text from JavaScript (PDF.js), use it
    if ($extractedText !== '' && $fileType === 'application/pdf') {
        error_log("Using pre-extracted text from PDF.js: " . strlen($extractedText) . " characters");
        $processedText = $extractedText;
        $method = 'PDF.js (JavaScript)';
    } elseif ($fileContent !== '') {
        // Fallback: process the uploaded bytes with the PHP document processor
        error_log("Using PHP fallback processing for: " . $fileType);

        // Strict mode makes base64_decode() return false on characters outside the
        // base64 alphabet; without it the failure check below could never trigger.
        $decodedContent = base64_decode($fileContent, true);
        if ($decodedContent === false) {
            throw new Exception('Failed to decode file content');
        }

        // Create temporary directory on demand (re-check is_dir in case it appeared meanwhile)
        $tempDir = '../../uploads/temp/';
        if (!is_dir($tempDir) && !mkdir($tempDir, 0755, true) && !is_dir($tempDir)) {
            throw new Exception('Failed to create temporary directory');
        }

        // The file name comes from the client, so it must never be used as a path:
        // keep only the final path component, replace every character outside a
        // conservative set, drop leading dots and cap the length (keeping the tail
        // preserves the extension).
        $safeName = basename(str_replace('\\', '/', $fileName));
        $safeName = preg_replace('/[^A-Za-z0-9._-]/', '_', $safeName);
        $safeName = ltrim(substr($safeName, -100), '.');
        if ($safeName === '') {
            $safeName = 'upload';
        }

        // Random prefix so the temporary file name cannot be predicted from outside
        $tempFile = $tempDir . 'memo_' . bin2hex(random_bytes(8)) . '_' . $safeName;

        if (file_put_contents($tempFile, $decodedContent) === false) {
            if (file_exists($tempFile)) {
                unlink($tempFile);
            }
            throw new Exception('Failed to store uploaded file for processing');
        }

        // Process using PHP robust processor (class file is already loaded at the top of this script)
        try {
            $processor = new RobustDocumentProcessor(true);
            $processedText = $processor->extractText($tempFile);
            $method = 'PHP Robust Processor';
        } catch (Exception $e) {
            error_log('PHP processing failed: ' . $e->getMessage());
            throw new Exception('Failed to process document: ' . $e->getMessage(), 0, $e);
        } finally {
            // Clean up temporary file whether or not extraction succeeded
            if (file_exists($tempFile)) {
                unlink($tempFile);
            }
        }
    } else {
        throw new Exception('No file content or extracted text provided');
    }

    if (empty($processedText)) {
        throw new Exception('No readable text could be extracted from the document');
    }

    // Apply additional cleaning and validation for business content
    $finalText = cleanBusinessText($processedText);

    error_log("Text processing successful via {$method}: " . strlen($finalText) . ' characters');

    // Simple text analysis.
    // Note: cleanBusinessText() collapses all whitespace to single spaces, so the
    // newline count is normally 0 and the paragraph figure falls back to 1.
    $wordCount = str_word_count($finalText);
    $sentenceCount = preg_match_all('/[.!?]+/', $finalText);
    $paragraphCount = max(1, substr_count($finalText, "\n"));
    $charCount = strlen($finalText);

    // Build the structured response
    $responseJson = json_encode([
        'success' => true,
        'extracted_text' => $finalText,
        'debug_info' => "Method: {$method}\nOriginal length: " . strlen($processedText) . "\nCleaned length: {$charCount}",
        'analysis' => [
            'words' => $wordCount,
            'sentences' => $sentenceCount,
            'sections' => 0,
            'paragraphs' => $paragraphCount,
            'characters' => $charCount
        ],
        'file_info' => [
            'name' => $fileName,
            'type' => $fileType,
            'size' => $fileSize,
            'processing_method' => $method
        ]
    ]);

    // json_encode() returns false e.g. on invalid UTF-8 in the extracted text; report
    // that as a regular error instead of sending an empty response body.
    if ($responseJson === false) {
        throw new Exception('Failed to encode response: ' . json_last_error_msg());
    }

    echo $responseJson;

} catch (Throwable $e) {
    // Throwable (not just Exception) so that engine errors such as TypeError also
    // produce a JSON error response rather than an empty/HTML one.
    error_log('AJAX Error: ' . $e->getMessage());
    echo json_encode([
        'success' => false,
        'error' => $e->getMessage()
    ], JSON_PARTIAL_OUTPUT_ON_ERROR);
}

/**
 * Clean text specifically for business documents.
 *
 * Input that looks like binary/encoded data or bare PDF metadata is rejected:
 * in that case a bracketed, human-readable explanation is returned INSTEAD of
 * the text. Otherwise leftover PDF keywords are removed, every run of
 * whitespace (including newlines) is collapsed to one space and the result is
 * trimmed.
 *
 * Note: the "alphabetic" ratio counts ASCII letters against the byte length,
 * so text written mostly in non-Latin scripts or consisting mostly of
 * digits/punctuation is rejected as well.
 *
 * @param string $text Raw extracted text.
 * @return string Cleaned single-line text, or a bracketed rejection message.
 */
function cleanBusinessText($text) {
    $text = (string) $text;

    // Check if text is mostly binary/encoded data
    $alphaCount = preg_match_all('/[a-zA-Z]/', $text);
    $totalLength = strlen($text);
    $alphaRatio = $totalLength > 0 ? ($alphaCount / $totalLength) : 0;

    // If less than 40% alphabetic, it's likely encoded/binary data
    if ($alphaRatio < 0.4) {
        error_log("Text appears to be encoded/binary (alpha ratio: {$alphaRatio}), rejecting: " . substr($text, 0, 100));
        return "[PDF contains encoded/compressed text - could not extract readable content. Please try converting to Word format or ensure the PDF contains selectable text.]";
    }

    // Check for specific encoded patterns: three leading alphanumeric tokens
    // combined with fewer than 10 words overall
    if (preg_match('/^[A-Za-z0-9:]+\s+[A-Za-z0-9]+\s+[A-Za-z0-9]+/', $text) && str_word_count($text) < 10) {
        error_log("Text appears to be PDF metadata/encoding: " . $text);
        return "[PDF appears to contain encoded text or metadata rather than readable content. Please check if the PDF has selectable text.]";
    }

    // Remove PDF artifacts that might still be present (whole words, case-insensitive)
    $text = preg_replace('/\b(endobj|stream|obj|Filter|FlateDecode)\b/i', '', $text);

    // Collapse whitespace AFTER removing the artifacts: each removed keyword leaves
    // its surrounding spaces behind, and collapsing first would let those gaps
    // survive as double spaces in the result.
    $text = preg_replace('/\s+/', ' ', $text);

    return trim($text);
}

// Send the buffered JSON response to the client and close the output buffer
// opened by ob_start() at the top of this script. Not reached when the
// authentication check has already ended the request with exit.
ob_end_flush();

/**
 * Structure memorandum content similar to JavaScript version.
 *
 * The text is decoded from HTML, stripped of tags, normalised to single
 * spaces and then re-split: a newline is inserted after every ". " and in
 * front of every "<digits>." token. Each resulting line is classified as a
 * section heading (numbered item, "Label:" prefix or one of a fixed list of
 * keywords) or, when longer than 20 characters, as answer content belonging
 * to the most recent heading.
 *
 * @param string $text Memorandum text, possibly containing HTML markup/entities.
 * @return array Keys: 'full_text' (normalised text), 'key_points' (heading lines),
 *               'answers' (list of ['section' => string, 'content' => string]),
 *               'word_count' (int) and 'structure_analysis'
 *               (['sentences', 'paragraphs', 'sections', 'characters']).
 */
function structureMemorandumContent($text) {
    // Convert HTML entities first so encoded markup is handled like real markup
    $text = html_entity_decode((string) $text, ENT_QUOTES | ENT_HTML5, 'UTF-8');

    // Turn line-breaking tags into whitespace BEFORE strip_tags(): once the tags are
    // stripped there is nothing left to replace, and the words on either side of
    // e.g. "</p><p>" would be glued together.
    $text = preg_replace('#<br\s*/?>|</(?:p|div|section)\s*>#i', "\n", $text);
    $text = strip_tags($text); // Remove remaining HTML tags

    $text = trim(preg_replace('/\s+/', ' ', $text)); // Normalize whitespace
    $text = str_replace('. ', ".\n", $text); // Split sentences to new lines
    $text = preg_replace('/\d+\.\s*/', "\n$0", $text); // Put numbered items on new lines

    // Compare against '' rather than using empty(): empty('0') is true and would
    // silently drop a line consisting of the single character "0".
    $lines = array_filter(explode("\n", $text), static function ($line) {
        return trim($line) !== '';
    });

    $keyPoints = [];
    $answers = [];
    $currentSection = '';

    foreach ($lines as $line) {
        $line = trim($line);

        // Identify questions/sections: numbered items, "Label:" prefixes or known keywords
        if (preg_match('/^(\d+\.|\w+\s*\d*\s*:|Chapter|Task|Assignment|Learning|Assessment|Background|Instructions|Scenario|Case|Question|Q\d+|Part|Section|Essay|Brief|Criteria|Outcomes|Information|Objective|Goal|Requirement|Answer|Solution|Example|Note|Important|Summary)/i', $line)) {
            $currentSection = $line;
            $keyPoints[] = $line;
        } elseif (strlen($line) > 20) {
            // Likely an answer or key point; attach it to the latest heading seen
            $answers[] = [
                'section' => $currentSection,
                'content' => $line
            ];
        }
    }

    // Paragraphs are blocks separated by a blank line; at least one is always reported
    $paragraphs = array_filter(preg_split('/\n\s*\n/', $text), static function ($paragraph) {
        return trim($paragraph) !== '';
    });

    return [
        'full_text' => $text,
        'key_points' => $keyPoints,
        'answers' => $answers,
        'word_count' => str_word_count($text),
        'structure_analysis' => [
            'sentences' => count(preg_split('/[.!?]+/', $text)) - 1,
            'paragraphs' => max(1, count($paragraphs)),
            'sections' => count($keyPoints),
            'characters' => strlen($text)
        ]
    ];
}
?>
