#!/usr/bin/perl -w
# spelling -- lintian check script

# Look for common spelling errors in the package description, the
# copyright file, and README.Debian.

# Copyright (C) 1998 by Richard Braakman
# 
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program.  If not, you can find it on the World Wide
# Web at http://www.gnu.org/copyleft/gpl.html, or write to the Free
# Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
# MA 02111-1307, USA.

# Todo:
#   - if a word has only one alphabetic part, don't check it twice.

# All spelling errors that have been observed "in the wild" in package
# descriptions are added here, on the grounds that if they occurred
# once they are more likely to occur again.

# Misspellings of "compatibility", "separate", and "similar" are 
# particularly common.

# Be careful with corrections that involve punctuation, since the check
# is a bit rough with punctuation.  For example, I had to delete the
# correction of "builtin" to "built-in".

# Table of known misspellings: each pair below is "<misspelling> <correction>".
# spelling_check() lowercases a word before looking it up, so the keys
# (the misspellings) must be written in lower case.  Because qw() splits
# on whitespace and has no comment syntax, neither side of a pair may
# contain a space and no comments may be placed inside the list.
%corrections = qw(
accesnt accent
accelleration acceleration
accessable accessible
accomodate accommodate
acess access
acording according
additionaly additionally
adress address
adresses addresses
adviced advised
albumns albums
alegorical allegorical
algorith algorithm
allpication application
altough although
alows allows
amoung among
amout amount
analysator analyzer
ang and
appropiate appropriate
arraival arrival
artifical artificial
artillary artillery
attemps attempts
automatize automate
automatized automated
automatizes automates
auxilliary auxiliary
availavility availability
availble available
avaliable available
availiable available
backgroud background
baloons balloons
becomming becoming
becuase because
cariage carriage
challanges challenges
changable changeable
charachters characters
charcter character
choosen chosen
colorfull colorful
comand command
commerical commercial
comminucation communication
commoditiy commodity
compability compatibility
compatability compatibility
compatable compatible
compatibiliy compatibility
compatibilty compatibility
compleatly completely
complient compliant
compres compress
containes contains
containts contains
contence contents
continous continuous
contraints constraints
convertor converter
convinient convenient
cryptocraphic cryptographic
deamon daemon
debians Debian's
decompres decompress
definate definite
definately definitely
dependancies dependencies
dependancy dependency
dependant dependent
developement development
developped developed
deveolpment development
devided divided
dictionnary dictionary
diplay display
disapeared disappeared
dissapears disappears
documentaion documentation
docuentation documentation
documantation documentation
dont don't
easilly easily
ecspecially especially
edditable editable
editting editing
eletronic electronic
enchanced enhanced
encorporating incorporating
enlightnment enlightenment
enterily entirely
enviroiment environment
environement environment
excellant excellent
exlcude exclude
exprimental experimental
extention extension
failuer failure
familar familiar
fatser faster
fetaures features
forse force
fortan fortran
framwork framework
fuction function
fuctions functions
functionnality functionality
functonality functionality
functionaly functionally
futhermore furthermore
generiously generously
grahical graphical
grahpical graphical
grapic graphic
guage gauge
halfs halves
heirarchically hierarchically
helpfull helpful
hierachy hierarchy
hierarchie hierarchy
howver however
implemantation implementation
incomming incoming
incompatabilities incompatibilities
indended intended
indendation indentation
independant independent
informatiom information
initalize initialize
inofficial unofficial
integreated integrated
integrety integrity
integrey integrity
intendet intended
interchangable interchangeable
intermittant intermittent
jave java
langage language
langauage language
langugage language
lauch launch
lesstiff lesstif
libaries libraries
licenceing licencing
loggin login
logile logfile
loggging logging
maintainance maintenance
maintainence maintenance
makeing making
managable manageable
manoeuvering maneuvering
ment meant
modulues modules
monochromo monochrome
multidimensionnal multidimensional
navagating navigating
nead need
neccesary necessary
neccessary necessary
necesary necessary
nescessary necessary
noticable noticeable
optionnal optional
orientied oriented
pacakge package
pachage package
packacge package
packege package
packge package
pakage package
particularily particularly
persistant persistent
plattform platform
ploting plotting
posible possible
powerfull powerful
prefered preferred
prefferably preferably
prepaired prepared
princliple principle
priorty priority
proccesors processors
proces process
processsing processing
processessing processing
progams programs
programers programmers
programm program
programms programs
promps prompts
pronnounced pronounced
prononciation pronunciation
pronouce pronounce
protcol protocol
protocoll protocol
recieve receive
recieved received
redircet redirect
regulamentations regulations
remoote remote
repectively respectively
replacments replacements
requiere require
runnning running
safly safely
savable saveable
searchs searches
separatly separately
seperate separate
seperately separately
seperatly separately
serveral several
setts sets
similiar similar
simliar similar
speach speech
standart standard
staically statically
staticly statically
succesful successful
succesfully successfully
suplied supplied
suport support
suppport support
supportin supporting
synchonized synchronized
syncronize synchronize
syncronizing synchronizing
syncronus synchronous
syste system
sythesis synthesis
taht that
throught through
useable usable
usefull useful
usera users
usetnet Usenet
utilites utilities
utillities utilities
utilties utilities
utiltity utility
utitlty utility
variantions variations
varient variant
verson version
vicefersa vice-versa
yur your
wheter whether
wierd weird
xwindows X
);
# The qw() format above doesn't allow spaces, so a correction that
# contains one has to be added to the table individually.
$corrections{'alot'} = 'a lot';

# Command line: spelling <pkg> <type>.  Both values are kept in package
# globals because tag_error() reads $pkg and $type when it prints a tag.
(@ARGV == 2) or fail("syntax: spelling <pkg> <type>");
$pkg = shift;
$type = shift;

# Files to check, relative to the current directory, each paired with the
# tag that is reported for a misspelling found in it.
my @checks = (
    [ "fields/description", "spelling-error-in-description" ],
    [ "copyright",          "spelling-error-in-copyright" ],
    [ "README.Debian",      "spelling-error-in-readme-debian" ],
);

foreach my $check (@checks) {
    my ($path, $tag) = @$check;

    # A file that cannot be opened is silently skipped.
    open(my $fh, '<', $path) or next;

    # Read in the entire file at one gulp.  $/ is undefined only for the
    # duration of this read instead of for the whole program.
    my $text = do { local $/; <$fh> };
    close($fh);

    # Check defined(), because the read can return the undefined value
    # (this has been observed for files of length 0).
    spelling_check($tag, $text) if defined($text);
}

exit 0;

# -----------------------------------

# Report an internal error on STDERR and terminate with exit status 1.
# The text shown is the first argument when that is true; otherwise the
# current value of $! when that is true; otherwise a generic message.
sub fail {
    my $message = $_[0] ? "internal error: $_[0]\n"
		: $!    ? "internal error: $!\n"
		:         "internal error.\n";
    print STDERR $message;
    exit 1;
}

# Print one error tag line on standard output, in the form
#     E: <pkg> <type>: <tag> [<arg> ...]
# where $pkg and $type are the package globals of the same name.
# The first argument is the tag name; any further arguments are joined
# with single spaces and appended to the line as the tag's message.
sub tag_error {
    my ($tag, @details) = @_;
    if (@details) {
	# We can't have newlines in a tag message, so turn every one of
	# them into the two characters \n.  This works on copies: the
	# elements of @_ alias the caller's values, which must not be
	# changed and may be read-only constants.
	foreach my $detail (@details) {
	    $detail =~ s/\n/\\n/g;
	}
	my $args = join(' ', @details);
	print "E: $pkg $type: $tag $args\n";
    } else {
	print "E: $pkg $type: $tag\n";
    }
}

# Check every whitespace-separated word of each given text against the
# %corrections table and report each known misspelling via tag_error().
#   first argument      -- the tag name to report misspellings under
#   remaining arguments -- the texts to check
# Every report carries the misspelled word followed by its correction.
# All working variables are lexical, so nothing leaks into package globals.
sub spelling_check {
    my ($tag, @texts) = @_;
    foreach my $text (@texts) {
	foreach my $raw (split(' ', $text)) {
	    my $word = lc $raw;
	    # Such "words" can contain punctuation, internal hyphens, etc.
	    # First try splitting it into alphabetic parts.
	    my $parts = 0;
	    foreach my $part (split(/[^a-z]+/, $word)) {
		# A leading non-alphabetic character yields an empty part.
		next if $part eq '';
		$parts++;
		tag_error($tag, $part, $corrections{$part})
		    if exists $corrections{$part};
	    }
	    # With fewer than two alphabetic parts the joined form below
	    # would only repeat the lookup that was just done.
	    next if $parts < 2;
	    # Then try deleting the non-alphabetic parts from the word.
	    # Treat apostrophes specially: only delete them if they occur
	    # at the beginning or end of the word.
	    $word =~ s/^'|[^a-z']+|'$//g;
	    tag_error($tag, $word, $corrections{$word})
		if exists $corrections{$word};
	}
    }
}
