require 'sinatra'
require 'json'
require 'nokogiri'
require 'uri'
require 'net/http'

set :port, 4567
set :bind, '0.0.0.0'

# --- helpers ---

# Endpoint of the French Wiktionary MediaWiki API.
WIKTIONARY_API = 'https://fr.wiktionary.org/w/api.php'

# Maximum number of distinct IPA transcriptions returned per word.
MAX_IPA = 4

# Maximum number of grammatical-category entries returned per word.
MAX_DEFINITIONS = 2

# An h3 heading containing any of these words is not treated as a grammatical
# category. `Voir aussi` is split by %w into two entries; either word matches
# that heading.
SKIPPED_HEADING_WORDS = %w[
  Étymologie Prononciation Anagrammes Voir aussi Références Synonyme Dérivé
  Proverbe Variante Hyperonyme Hyponyme Vocabulaire Traduction Homophone
].freeze

# Substitutions applied in order by clean_html; every match is deleted.
# NOTE(review): the leading `<span[^>` of each span pattern was missing from
# the damaged source text and has been restored here — confirm against the
# original file.
HTML_CLEANUP_PATTERNS = [
  # mw-editsection spans, up to their first closing tag
  %r{<span[^>]*class="[^"]*mw-editsection[^"]*"[^>]*>.*?</span>}m,
  # self-closing mw-editsection spans
  %r{<span[^>]*class="[^"]*mw-editsection[^"]*"[^>]*/>},
  # "modifier" edit links
  %r{<span[^>]*>modifier</span>}i,
  %r{<span[^>]*>modifier le wikicode</span>}i,
  # opening tags of lang spans
  /<span[^>]*class="[^"]*lang[^"]*"[^>]*>/,
  # every remaining closing span tag
  %r{</span>}
].freeze

# Fetches the rendered HTML of a French Wiktionary page through the
# `action=parse` API.
#
# The query string is built with URI.encode_www_form so that characters such
# as `&`, `=` or `+` inside +word+ cannot add or alter API parameters.
#
# @param word [String] page title to look up
# @return [String, nil] the page HTML, or nil when the response is not a
#   success, carries no `parse.text.*` entry, or the request/parsing failed
def fetch_wiktionary_page(word)
  uri = URI(WIKTIONARY_API)
  uri.query = URI.encode_www_form(
    action: 'parse', page: word, format: 'json', prop: 'text', parserversion: 2
  )

  resp = Net::HTTP.get_response(uri)
  return nil unless resp.is_a?(Net::HTTPSuccess)

  JSON.parse(resp.body)['parse']&.dig('text', '*')
rescue SocketError, SystemCallError, Timeout::Error, JSON::ParserError => e
  # The caller reports nil as "page not found"; log the real cause so a
  # network or decoding failure is not lost.
  warn "fetch_wiktionary_page(#{word.inspect}) failed: #{e.class}: #{e.message}"
  nil
end

# Extracts IPA pronunciations and the leading definitions from a Wiktionary
# page.
#
# @param html [String, nil] page HTML as returned by fetch_wiktionary_page
# @return [Hash, nil] `{ ipa: [String], definitions: [{ heading:, content: }] }`,
#   or nil when +html+ is nil, holds no h2 heading, or yields neither IPA nor
#   definitions
def extract_french_data(html)
  return nil unless html

  section_html = french_section_html(html)
  return nil unless section_html

  doc = Nokogiri::HTML(section_html)
  ipa_list = extract_ipa(doc)
  definitions = extract_definitions(doc)
  return nil if ipa_list.empty? && definitions.empty?

  { ipa: ipa_list, definitions: definitions }
end

# Returns the slice of +html+ from the first `<h2` up to (not including) the
# second `<h2`, or to the end of the string when there is only one. The code
# treats the first h2 section as the French one.
# NOTE(review): the scanning loop was truncated in the damaged source text;
# this is reconstructed from its surviving comment and tail expression
# (`... >= 2 ? h2_positions[1] : html.length`) — confirm against the original.
def french_section_html(html)
  start = html.index('<h2')
  return nil unless start

  finish = html.index('<h2', start + 1) || html.length
  html[start...finish]
end

# Collects up to MAX_IPA distinct IPA strings from `span.API` elements whose
# title mentions "Prononciation API".
def extract_ipa(doc)
  found = []
  doc.css('span.API').each do |span|
    next unless span['title']&.include?('Prononciation API')

    text = span.text.strip
    next if text.length < 2
    next if text.include?('Annexe:') || text.include?('Rimes')

    # Drop one backslash delimiter at the very start and very end.
    text = text.gsub(/\A\\|\\\z/, '')
    found << text unless found.include?(text)
  end
  found.first(MAX_IPA)
end

# Builds one `{ heading:, content: }` entry per grammatical-category h3 and
# returns the first MAX_DEFINITIONS of them, in document order.
def extract_definitions(doc)
  definitions = []
  doc.css('h3').each do |h3|
    heading_text = h3.text.strip.gsub(/\s+/, ' ')
    next if heading_text.empty? || heading_text.start_with?('[')
    next if SKIPPED_HEADING_WORDS.any? { |word| heading_text.include?(word) }

    # Nearest element ancestor whose class mentions mw-heading; non-element
    # ancestors (the document node) are ignored.
    mw_heading = h3.ancestors.find do |ancestor|
      ancestor.element? && ancestor['class'].to_s.include?('mw-heading')
    end
    next unless mw_heading

    definitions << { heading: heading_text, content: definition_content(mw_heading) }
  end
  definitions.first(MAX_DEFINITIONS)
end

# Gathers the cleaned HTML of the siblings that follow +mw_heading+ until the
# next grammatical category, then trims it to the first sentence-like chunk:
# 100-500 characters followed by `.` or `!`, else the first 501 characters.
def definition_content(mw_heading)
  parts = []
  node = mw_heading.next_element
  while node && !category_boundary?(node)
    unless skipped_content_node?(node)
      cleaned = clean_html(node)
      parts << cleaned unless cleaned.empty?
    end
    node = node.next_element
  end

  text = parts.join(' ').gsub(/\s+/, ' ').strip
  (text[/\A.{100,500}?[.!]/] || text[0..500]).strip
end

# True when +node+ starts a new h3-level section: either a bare h3, or a
# mw-heading wrapper that contains one. Only wrapped h3 headings are processed
# by extract_definitions, so testing the sibling's own tag name alone would
# never stop the walk at the next category.
def category_boundary?(node)
  return true if node.name == 'h3'

  node['class'].to_s.include?('mw-heading') && !node.at_css('h3').nil?
end

# True for siblings that are left out of a definition: edit-section markup,
# wikitables (usually the inflection table), figures, and paragraphs that
# contain IPA spans (redundant with the section-level IPA list).
def skipped_content_node?(node)
  css_class = node['class'].to_s
  return true if css_class.include?('mw-editsection')
  return true if node.name == 'table' && css_class.include?('wikitable')
  return true if node.name == 'figure'

  node.name == 'p' && !node.at_css('span.API').nil?
end

# Serialises +node+ to HTML, deletes every HTML_CLEANUP_PATTERNS match and
# collapses whitespace. The result is still an HTML fragment, not plain text:
# only the listed span markup is removed.
#
# @param node [Nokogiri::XML::Node, nil]
# @return [String] cleaned HTML, or '' when +node+ is nil
def clean_html(node)
  return '' unless node

  cleaned = HTML_CLEANUP_PATTERNS.reduce(node.to_html) do |text, pattern|
    text.gsub(pattern, '')
  end
  cleaned.strip.gsub(/\s+/, ' ')
end

# --- routes ---

# Renders the empty search form.
get '/' do
  @word = ''
  erb :index
end

# Looks up the submitted word and renders the form with either the result
# (@ipa, @definitions) or an error message (@error).
post '/search' do
  word = params['word'].to_s.strip
  @word = word

  if word.empty?
    @error = 'Veuillez entrer un mot.'
    return erb :index
  end

  html = fetch_wiktionary_page(word)
  if html.nil?
    @error = "Page Wiktionnaire non trouvée pour « #{word} »."
    return erb :index
  end

  result = extract_french_data(html)
  if result.nil?
    @error = "Aucune définition française trouvée pour « #{word} »."
  else
    @ipa = result[:ipa]
    @definitions = result[:definitions]
  end

  erb :index
end