require 'sinatra'
require 'json'
require 'nokogiri'
require 'uri'
require 'net/http'
# Sinatra server settings: the port is 4567 and the bind address is 0.0.0.0.
# NOTE(review): 0.0.0.0 presumably exposes the app on every network interface,
# not only localhost — confirm this is intended for the deployment target.
set :port, 4567
set :bind, '0.0.0.0'
# --- helpers ---
# Fetches the rendered HTML of a fr.wiktionary.org page through the MediaWiki
# `action=parse` API.
#
# @param word [String] page title to look up; sent as the `page` query parameter
# @return [String, nil] the value stored at parse -> text -> "*" in the JSON
#   reply, or nil when the HTTP response is not a success, the body is not
#   valid JSON, or the reply has no "parse" entry
def fetch_wiktionary_page(word)
  # Component-level encoding escapes reserved characters (&, =, #, +, ...) so
  # the word can never add or truncate query parameters. The encoder emits "+"
  # only for spaces (a literal "+" becomes %2B), so swapping it for %20 keeps
  # the space encoding this request has always used.
  escaped = URI.encode_www_form_component(word).gsub('+', '%20')
  uri = URI("https://fr.wiktionary.org/w/api.php?action=parse&page=#{escaped}&format=json&prop=text&parserversion=2")
  resp = Net::HTTP.get_response(uri)
  return nil unless resp.is_a?(Net::HTTPSuccess)

  JSON.parse(resp.body)['parse']&.dig('text', '*')
rescue JSON::ParserError
  # A body that is not JSON is treated like any other failed lookup.
  nil
end
# Extracts pronunciations (IPA) and definitions from the HTML of a Wiktionary
# page, restricted to a slice of the page (`html[start...finish]`).
#
# NOTE(review): the two lines right after `while pos < html.length` look
# damaged in this copy of the file: the `html.index('` call is cut off and the
# following line starts mid-expression with `= 2 ? ...`. It appears that text
# resembling an HTML tag was stripped, presumably taking with it the rest of
# the h2 scan and the assignments of `start` and `finish`. Restore those lines
# from version control before relying on this method; the description below
# covers only the part that is still readable.
#
# Pronunciations: every `span.API` whose title contains "Prononciation API" is
# considered. Texts shorter than 2 characters or containing "Annexe:" or
# "Rimes" are dropped, a backslash at the start or end of a line is removed,
# duplicates are ignored, and at most 4 entries are kept.
#
# Definitions: each h3 is a candidate unless its text is empty, starts with
# "[" or contains one of the skip words. Because the skip list is a %w literal,
# "Voir aussi" is stored as the two separate words "Voir" and "aussi". For a
# kept h3, the nearest ancestor whose class contains "mw-heading" is located
# and its following sibling elements are passed through clean_html, skipping
# elements whose class contains "mw-editsection", wikitable tables, figures
# and paragraphs that contain a span.API. The joined text is cut to the first
# sentence of 100-500 characters ending in "." or "!", falling back to the
# first 501 characters. At most 2 definitions are kept.
#
# NOTE(review): the sibling walk stops only at an element literally named h3.
# If later headings are wrapped in mw-heading containers like the one located
# here, that stop condition would presumably never fire — confirm against real
# page markup.
#
# @param html [String, nil] page HTML as returned by fetch_wiktionary_page
# @return [Hash, nil] `{ ipa: Array<String>, definitions: Array<Hash> }` where
#   each definition is `{ heading:, content: }`; nil when html is nil or when
#   neither a pronunciation nor a definition was found
def extract_french_data(html)
return nil unless html
# Find the French section: content between the first h2 (Français) and the second h2 (another language)
h2_positions = []
pos = 0
# NOTE(review): the loop body below is truncated in this copy (see the header note).
while pos < html.length
idx = html.index('
= 2 ? h2_positions[1] : html.length
section_html = html[start...finish]
# Parse the section
doc = Nokogiri::HTML(section_html)
# Extract IPA pronunciations (top-level in the French section)
ipa_list = []
doc.css('span.API').each do |span|
next unless span['title']&.include?('Prononciation API')
text = span.text.strip
next if text.empty? || text.length < 2
next if text.include?('Annexe:')
next if text.include?('Rimes')
text = text.gsub(/^\\|\\$/, '')
ipa_list << text unless ipa_list.include?(text)
end
# Limit to the most meaningful IPAs (first 3-4 unique ones)
ipa_list = ipa_list.take(4) if ipa_list.length > 4
# Extract definitions: h3 elements with grammatical category names
definitions = []
skip_words = %w[Étymologie Prononciation Anagrammes Voir aussi Références
Synonyme Dérivé Proverbe Variante Hyperonyme Hyponyme
Vocabulaire Traduction Homophone]
doc.css('h3').each do |h3|
heading_text = h3.text.strip.gsub(/\s+/, ' ')
next if heading_text.empty?
next if skip_words.any? { |w| heading_text.include?(w) }
next if heading_text.start_with?('[')
# Find the mw-heading div that wraps this h3
mw_heading = h3.parent
while mw_heading && !mw_heading['class']&.to_s&.include?('mw-heading')
mw_heading = mw_heading.parent
break if mw_heading.nil?
end
next if mw_heading.nil?
# Collect content between mw-heading and next h3
# Only take h4-level sub-sections (Synonymes, etc.) for additional detail
# Stop at the next h3 (new grammatical category)
content_parts = []
node = mw_heading.next_element
while node && node.name != 'h3'
next node = node.next_element if node['class']&.to_s&.include?('mw-editsection')
# Skip the definition table (wikitable with inflection) and IPA rows
if node.name == 'table' && node['class']&.to_s&.include?('wikitable')
# Skip wikitable (usually the inflection table for nouns)
node = node.next_element
next
end
# Skip images
if node.name == 'figure'
node = node.next_element
next
end
# Skip p tags that only contain IPA spans (redundant with section-level IPAs)
if node.name == 'p' && node.css('span.API').count > 0
node = node.next_element
next
end
clean = clean_html(node)
content_parts << clean unless clean.empty?
node = node.next_element
end
content_text = content_parts.join(' ').gsub(/\s+/, ' ').strip
# Trim: take only the first meaningful paragraph
# Split on double spaces or periods followed by space to get first major definition
first_def = content_text[/^.{100,500}?[.!]/]
first_def = content_text[0..500] if first_def.nil? || first_def.length < 100
content_text = first_def.strip
definitions << { heading: heading_text, content: content_text }
end
# Limit to top 2 definitions (main sense first)
definitions = definitions.take(2) if definitions.length > 2
return { ipa: ipa_list, definitions: definitions } if !ipa_list.empty? || !definitions.empty?
nil
end
# Serialises a node to HTML and strips wiki editing chrome from the markup.
#
# Only span-related markup is touched; every other tag in the serialised HTML
# is returned as-is.
#
# @param node [#to_html, nil] any object that can serialise itself to HTML
# @return [String] the cleaned markup with whitespace runs collapsed to single
#   spaces, or an empty string when node is nil
def clean_html(node)
  return '' unless node

  text = node.to_html
  # Every pattern below anchors on the opening "<span" so the complete tag is
  # removed and no dangling "<span " fragment is left in the output.

  # Remove mw-editsection spans (paired form, then self-closing form).
  text = text.gsub(%r{<span[^>]*class="[^"]*mw-editsection[^"]*"[^>]*>.*?</span>}m, '')
  text = text.gsub(%r{<span[^>]*class="[^"]*mw-editsection[^"]*"[^>]*/>}, '')
  # Remove "modifier" links.
  text = text.gsub(%r{<span[^>]*>modifier</span>}i, '')
  text = text.gsub(%r{<span[^>]*>modifier le wikicode</span>}i, '')
  # Remove the opening tag of lang spans, keeping their content.
  text = text.gsub(/<span[^>]*class="[^"]*lang[^"]*"[^>]*>/, '')
  # Every closing span tag is dropped, not only those paired with lang spans.
  text = text.gsub(%r{</span>}, '')
  text.strip.gsub(/\s+/, ' ')
end
# --- routes ---
# GET / — renders the :index template with @word set to an empty string.
get '/' do
  @word = ''
  erb(:index)
end
# POST /search — looks up the submitted word and renders :index.
#
# Reads params['word'] (stripped) into @word. On success @ipa and @definitions
# are set from the extracted data; otherwise @error carries the message for the
# first step that failed (empty input, page fetch, French-section extraction).
post '/search' do
  word = @word = params['word'].to_s.strip

  if word.empty?
    @error = 'Veuillez entrer un mot.'
  elsif (page_html = fetch_wiktionary_page(word)).nil?
    @error = "Page Wiktionnaire non trouvée pour « #{word} »."
  elsif (parsed = extract_french_data(page_html)).nil?
    @error = "Aucune définition française trouvée pour « #{word} »."
  else
    @ipa, @definitions = parsed.values_at(:ipa, :definitions)
  end

  erb :index
end