/**
 * Reads the entire contents of a file into memory.
 *
 * Wraps the callback-based `FileReader` API in a promise.
 *
 * @param file - The file to read.
 * @returns A promise that resolves with the file's bytes. When the read fails
 *   it rejects with the reader's own error if one is reported, otherwise with
 *   a generic `Error` (never with the raw `ProgressEvent`).
 */
function readFileAsArrayBuffer(file: File): Promise<ArrayBuffer> {
  return new Promise<ArrayBuffer>((resolve, reject) => {
    const reader = new FileReader();

    // `readAsArrayBuffer` is the only read method invoked on this reader, so
    // the result is an ArrayBuffer here. An `instanceof` check is avoided on
    // purpose: it can fail for buffers created in another realm.
    reader.onload = () => resolve(reader.result as ArrayBuffer);

    // The `error` event itself carries no message or stack; surface the
    // underlying error so callers can log or inspect something meaningful.
    reader.onerror = () => {
      reject(reader.error ?? new Error('Failed to read file'));
    };

    reader.readAsArrayBuffer(file);
  });
}

/**
 * Decodes raw bytes into a JavaScript string.
 *
 * Uses the platform `TextDecoder` (UTF-8) whenever it is defined. Otherwise it
 * falls back to mapping each byte to the character with the same code, which
 * agrees with UTF-8 only for ASCII input.
 *
 * @param buffer - The bytes to decode.
 * @returns The decoded text.
 */
function arrayBufferToString(buffer: Uint8Array): string {
  const decoderMissing = typeof TextDecoder === 'undefined';

  if (decoderMissing) {
    // Legacy path: one character per byte.
    const chars: string[] = [];
    buffer.forEach((byte) => {
      chars.push(String.fromCharCode(byte));
    });
    return chars.join('');
  }

  const decoder = new TextDecoder('utf-8');
  return decoder.decode(buffer);
}

/**
 * Collects the bytes of a PDF that the scan heuristic inspects: the first
 * 20 000 bytes plus a 20 000-byte window centred on the midpoint of the file.
 *
 * The midpoint window is clipped so that it never overlaps the head. Without
 * the clipping, files shorter than 60 000 bytes would have part of their
 * content sampled (and therefore counted) twice.
 */
function samplePdfBytes(bytes: Uint8Array): Uint8Array {
  const windowSize = 20000;
  const headEnd = Math.min(bytes.length, windowSize);

  const centredStart = Math.floor(bytes.length / 2) - windowSize / 2;
  const middleStart = Math.max(headEnd, centredStart);
  // Clamp so the window is empty (never negative) when the head already
  // reaches past the centred window.
  const middleEnd = Math.max(middleStart, Math.min(bytes.length, centredStart + windowSize));

  const sample = new Uint8Array(headEnd + (middleEnd - middleStart));
  sample.set(bytes.subarray(0, headEnd), 0);
  sample.set(bytes.subarray(middleStart, middleEnd), headEnd);
  return sample;
}

/**
 * Reports whether the text declares an image wider than 1000 and taller than
 * 800 pixels. The n-th `/Width` entry is paired with the n-th `/Height` entry,
 * which is an approximation rather than a parse of individual objects.
 */
function hasScanSizedImage(pdfText: string): boolean {
  const widths = pdfText.match(/\/Width\s+\d+/g) ?? [];
  const heights = pdfText.match(/\/Height\s+\d+/g) ?? [];
  const pairCount = Math.min(widths.length, heights.length);

  for (let i = 0; i < pairCount; i++) {
    // Each match is "/Width <digits>" or "/Height <digits>"; keep the digits.
    const width = parseInt(widths[i].replace(/\D/g, ''), 10);
    const height = parseInt(heights[i].replace(/\D/g, ''), 10);
    if (width > 1000 && height > 800) {
      return true;
    }
  }
  return false;
}

/**
 * Heuristically detects whether a PDF is a scan (page images without a real
 * text layer) by looking for text, font and image markers in the raw bytes.
 * No dependencies, no DOM manipulation, no PDF parsing.
 *
 * Only the start of the file and a window around its midpoint are inspected,
 * and content inside compressed streams is not decoded, so the result is an
 * estimate rather than a guarantee.
 *
 * @param file - The PDF file to analyze
 * @returns Promise<boolean> - True if the PDF is likely scanned, false
 *   otherwise. Also resolves to false when the file does not start with the
 *   `%PDF` signature or cannot be read; this function never rejects.
 */
export async function isScannedPdf(file: File): Promise<boolean> {
  try {
    const bytes = new Uint8Array(await readFileAsArrayBuffer(file));

    // PDF signature: "%PDF"
    if (bytes[0] !== 0x25 || bytes[1] !== 0x50 || bytes[2] !== 0x44 || bytes[3] !== 0x46) {
      return false;
    }

    const pdfText = arrayBufferToString(samplePdfBytes(bytes));

    // Literal strings "(...)" holding at least three ordinary characters.
    // Parentheses are deliberately NOT allowed inside the match: otherwise
    // neighbouring strings such as "(abc) Tj (def) Tj" merge into one match
    // and real text is under-counted.
    const meaningfulTextCount = (pdfText.match(/\(([\w\s,.;:'\-!@#$%^&*]{3,})\)/g) ?? []).length;

    // Absence of font resources often indicates a scanned document.
    const hasFonts =
      pdfText.includes('/Font') &&
      (pdfText.includes('/BaseFont') || pdfText.includes('/FontName'));

    // Image objects and the compression filters commonly used for scans.
    const imageKeywords = ['/Image', '/XObject', '/DCTDecode', '/JPXDecode', '/CCITTFaxDecode'];
    const hasLargeImages = hasScanSizedImage(pdfText);
    const hasImages = hasLargeImages || imageKeywords.some((keyword) => pdfText.includes(keyword));

    // 1. Virtually no meaningful text strings, but images exist.
    if (meaningfulTextCount < 3 && hasImages) {
      return true;
    }

    // 2. No fonts defined, but images exist.
    if (!hasFonts && hasImages) {
      return true;
    }

    // 3. Very few text strings per page. Only leaf "/Type /Page" objects are
    //    counted; the lookahead excludes "/Type /Pages" page-tree nodes,
    //    which would otherwise inflate the page count.
    const pageMatches = pdfText.match(/\/Type\s*\/Page(?![A-Za-z])/g);
    const estimatedPageCount = Math.max(1, pageMatches ? pageMatches.length : 1);
    const textPerPage = meaningfulTextCount / estimatedPageCount;

    if (textPerPage < 10 && hasImages) {
      return true;
    }

    // 4. Page-sized images together with relatively little text.
    if (hasLargeImages && textPerPage < 20) {
      return true;
    }

    // Otherwise, it's likely a normal PDF with text content.
    return false;
  } catch {
    // Best effort: if the file cannot be read or analysed we cannot tell, so
    // default to "not scanned".
    return false;
  }
}
