require 'sinatra'
require 'json'
require 'nokogiri'
require 'uri'
require 'net/http'

# Sinatra server settings: serve on port 4567 and bind to 0.0.0.0 so the app
# accepts connections on every network interface rather than only localhost.
set :port, 4567
set :bind, '0.0.0.0'

# --- helpers ---

# Fetches the rendered HTML of a French Wiktionary page through the MediaWiki
# `action=parse` API.
#
# The query string is built with URI.encode_www_form so that reserved
# characters in +word+ (`&`, `=`, `+`, `#`, ...) are percent-encoded as part of
# the `page` value instead of being read as query-string syntax.
#
# @param word [String] page title to look up
# @return [String, nil] the value found at parse -> text -> "*" in the JSON
#   response; nil when the HTTP response is not a success or the JSON has no
#   "parse" object (e.g. an API error payload)
def fetch_wiktionary_page(word)
  query = URI.encode_www_form(
    action: 'parse',
    page: word,
    format: 'json',
    prop: 'text',
    parserversion: 2
  )
  uri = URI("https://fr.wiktionary.org/w/api.php?#{query}")
  resp = Net::HTTP.get_response(uri)
  return nil unless resp.is_a?(Net::HTTPSuccess)

  JSON.parse(resp.body)['parse']&.dig('text', '*')
end

# Extracts IPA pronunciations and the leading definitions from the rendered
# HTML of a Wiktionary page.
#
# Only the first language section is examined: the slice of +html+ from the
# first `<h2` up to (not including) the second `<h2`, or to the end of the
# string when there is only one.
# NOTE(review): nothing here checks that this first section is the French one;
# confirm that assumption for the pages being queried.
#
# @param html [String, nil] page HTML
# @return [Hash, nil] `{ ipa: Array<String>, definitions: Array<Hash> }`, each
#   definition being `{ heading: String, content: String }`; nil when +html+
#   is nil, contains no `<h2`, or yields neither IPA nor definitions
def extract_french_data(html)
  return nil unless html

  section_start = html.index('<h2')
  return nil unless section_start

  section_end = html.index('<h2', section_start + 3) || html.length
  doc = Nokogiri::HTML(html[section_start...section_end])

  ipa_list = french_ipa_list(doc)
  definitions = french_definitions(doc)
  return nil if ipa_list.empty? && definitions.empty?

  { ipa: ipa_list, definitions: definitions }
end

# Collects up to four distinct transcriptions from `span.API` elements whose
# title attribute mentions "Prononciation API".
def french_ipa_list(doc)
  transcriptions = []
  doc.css('span.API').each do |span|
    next unless span['title']&.include?('Prononciation API')

    text = span.text.strip
    next if text.length < 2
    next if text.include?('Annexe:') || text.include?('Rimes')

    # Drop a leading/trailing backslash (presumably the delimiters rendered
    # around each transcription).
    transcriptions << text.gsub(/^\\|\\$/, '')
  end
  transcriptions.uniq.first(4)
end

# Builds at most two `{ heading:, content: }` entries from the h3 headings that
# are not filtered out by the skip list below.
def french_definitions(doc)
  # %w splits on whitespace, so "Voir aussi" contributes both "Voir" and "aussi".
  skip_words = %w[Étymologie Prononciation Anagrammes Voir aussi Références
                  Synonyme Dérivé Proverbe Variante Hyperonyme Hyponyme
                  Vocabulaire Traduction Homophone]

  definitions = []
  doc.css('h3').each do |h3|
    heading_text = h3.text.strip.gsub(/\s+/, ' ')
    next if heading_text.empty? || heading_text.start_with?('[')
    next if skip_words.any? { |w| heading_text.include?(w) }

    wrapper = heading_wrapper(h3)
    next unless wrapper

    content = definition_excerpt(definition_markup(wrapper))
    definitions << { heading: heading_text, content: content }
    # Only the first two categories are reported; no need to scan further.
    break if definitions.length == 2
  end
  definitions
end

# Returns the nearest ancestor element of +heading+ whose class attribute
# contains "mw-heading", or nil when there is none. The walk stops at the
# document node, which is not an element and has no parent to climb to.
def heading_wrapper(heading)
  node = heading.parent
  while node&.element?
    return node if node['class'].to_s.include?('mw-heading')

    node = node.parent
  end
  nil
end

# Joins the cleaned markup of the siblings that follow +wrapper+, stopping at
# the next level-3 heading. Lower-level sub-sections (h4, ...) are kept.
def definition_markup(wrapper)
  parts = []
  node = wrapper.next_element
  while node && !level3_heading?(node)
    unless skip_definition_node?(node)
      cleaned = clean_html(node)
      parts << cleaned unless cleaned.empty?
    end
    node = node.next_element
  end
  parts.join(' ').gsub(/\s+/, ' ').strip
end

# True when +node+ starts a new level-3 section: either a bare h3, or an
# "mw-heading" wrapper that contains one (the same wrapping heading_wrapper
# relies on for the current heading).
def level3_heading?(node)
  return true if node.name == 'h3'

  node['class'].to_s.include?('mw-heading') && !node.at_css('h3').nil?
end

# True for siblings that must not appear in a definition: edit-section markers,
# wikitable tables, figures, and any paragraph containing a `span.API`
# (pronunciations are already collected separately).
def skip_definition_node?(node)
  css_class = node['class'].to_s
  return true if css_class.include?('mw-editsection')
  return true if node.name == 'table' && css_class.include?('wikitable')
  return true if node.name == 'figure'

  node.name == 'p' && !node.at_css('span.API').nil?
end

# Shortens +content_text+ to its first 101-501 characters ending in "." or "!",
# falling back to the first 501 characters when no such prefix exists.
# NOTE(review): the text comes from clean_html and still contains markup, so
# the cut can land inside a tag or attribute — confirm how the view renders it.
def definition_excerpt(content_text)
  excerpt = content_text[/^.{100,500}?[.!]/] || content_text[0..500]
  excerpt.strip
end

# Serializes +node+ with #to_html and strips editing chrome from the markup.
#
# The substitutions run in order on the serialized string:
#   1-2. spans whose class contains "mw-editsection" (paired, then self-closed)
#   3-4. spans whose whole text is "modifier" / "modifier le wikicode"
#   5.   opening tags of spans whose class contains "lang"
#   6.   every closing </span> tag
# Whitespace is then trimmed and collapsed to single spaces.
#
# NOTE(review): step 1 is non-greedy, so with nested spans it ends at the first
# inner </span>; step 6 removes closing tags of spans whose opening tag is
# kept. The result is therefore not guaranteed to be balanced HTML.
#
# @param node [#to_html, nil] element to serialize
# @return [String] cleaned markup, or '' when +node+ is nil/false
def clean_html(node)
  return '' unless node

  removals = [
    /<span[^>]*class="[^"]*mw-editsection[^"]*"[^>]*>.*?<\/span>/m,
    /<span[^>]*class="[^"]*mw-editsection[^"]*"[^>]*\/>/,
    /<span[^>]*>modifier<\/span>/i,
    /<span[^>]*>modifier le wikicode<\/span>/i,
    /<span[^>]*class="[^"]*lang[^"]*"[^>]*>/,
    /<\/span>/
  ]

  markup = removals.reduce(node.to_html) { |html, pattern| html.gsub(pattern, '') }
  markup.strip.gsub(/\s+/, ' ')
end

# --- routes ---

# Landing page: renders the :index template with @word set to an empty string.
get '/' do
  @word = ''
  erb :index
end

# Search handler: looks up the submitted word and renders the :index template.
#
# Exactly one outcome is exposed to the template:
#   - @error when the word is blank, the page could not be fetched, or no data
#     could be extracted from it;
#   - @ipa and @definitions otherwise.
# @word always holds the trimmed submitted value.
post '/search' do
  word = params['word'].to_s.strip
  @word = word

  # Each branch is only evaluated when the previous step succeeded, so the
  # remote fetch and the parse are skipped as soon as an error is known.
  if word.empty?
    @error = 'Veuillez entrer un mot.'
  elsif (html = fetch_wiktionary_page(word)).nil?
    @error = "Page Wiktionnaire non trouvée pour « #{word} »."
  elsif (result = extract_french_data(html)).nil?
    @error = "Aucune définition française trouvée pour « #{word} »."
  else
    @ipa = result[:ipa]
    @definitions = result[:definitions]
  end

  erb :index
end