#!/usr/bin/perl # convert MDL mol file to structure search key # this routine does not use the bond order from the mol file # to deterimine the canonical ordering. Instead it uses the # sum of the atomic numbers of the connected atoms to help # resolve ambiguities. Also H is included, so that distinctions # can be made on radicals and cations where the H abstraction # position differs. Explicit hydrogens also allow for # bridging and explicit H-bonds. # The output is the number of atoms, the sum of the Z's, # the list of elements in canonical order, and the condensed # connection matrix in the same order as the list of elements. # This current version is designed for webmo, so it doesn't # need to handle more than 8 atoms.... sorry # # Thomas W. Shattuck, Dept. of Chemistry, Colby College, # Waterville, ME 04901, twshattu@colby.edu. July 5, 2000. # # # this list gives the allowed elements $nsym = 36 ; @sym = ( 'H','He','Li','Be','B','C','N','O','F','Ne','Na','Mg','Al','Si', 'P','S','Cl','Ar','K','Ca','Sc','Ti','V','Cr','Mn', 'Fe','Co','Ni','Cu','Zn','Ga','Ge','As','Se','Br') ; @eZ = (1,2,3,4,5,6,7,8,9,10,11,12,13,14, 15,16,17,18,19,20,21,22,23,24,25, 26,27,28,29,30,31,32,33,34,35 ) ; # read the current directory # open each file and extract formula, charge, and multiplicity #$file = "allyl-.mol" ; #$file = "CH3AsH2.mol" ; #$file = "BF3.mol" ; opendir(DIR,'.') || die "Can't open directory\n" ; local(@filenames) = readdir(DIR) ; closedir(DIR) ; for (@filenames) { next unless /mol$/ ; $file = $_ ; open(MOL,$file) || die "Can't open mol file $file" ; # get cartesian coordinates $_ = ; # Snip off the -ISIS- line. $_ = ; while ( ) { last unless /^\n/ } ; ($iatm,$ibnd,$x) = split ; die "Zero atoms in file $file\n" if $iatm < 1 ; # # initialize connection matrix # no number after name - active matrix # 0 after name - orginial matrix # 1 after name - old matrix from previous iteration $ncnt1 = 0 ; for ($j=0 ; $j<$iatm ; $j++ ) { $ixcnct[$j] = 0 ; $sumZ[$j] = 0 ; for ($k=0 ; $k<$iatm ; $k++ ) { $i = $j*$iatm+$k ; $conct[$i]=0 ; } } $j = -1 ; while () { $j++ ; ($x[$j],$y[$j],$z[$j],$el[$j],$x) = split ; # find Z for ($isym=0; $isym<$nsym; $isym++) { if (length($el[$j]) == 2 ) { substr($el[$j],1,1) =~ tr/A-Z/a-z/ } if ( $el[$j] eq $sym[$isym] ) { $iZ[$j] = $eZ[$isym] ; goto gotsym } } die "Unknown element symbol ($el[$j]) in file $file\n" ; gotsym: last if $j == $iatm-1 ; } # end while coordinates $j = -1 ; # die "1 $iZ[0] $el[0] 0 $file\n" if $iatm < 2 ; # now connection table while () { $j++ ; ($a1[$j],$a2[$j],$bo[$j],$x) = split ; $a1[$j]-- ; $a2[$j]-- ; $conct[$a1[$j]*$iatm+$a2[$j]]=1 ; $ixcnct[$a1[$j]]++ ; $sumZ[$a1[$j]] += $iZ[$a2[$j]] ; $conct[$a2[$j]*$iatm+$a1[$j]]=1 ; $ixcnct[$a2[$j]]++ ; $sumZ[$a2[$j]] += $iZ[$a1[$j]] ; last if $j == $ibnd-1 ; } # end while bonds # now charge and multiplicity $chargem = 0 ; $multm = 1 ; while () { if (/CHARGE/) { $chargem = ; chop($chargem) ;} if (/MULTIPLICITY/) { $multm = ; chop($multm) } } # end while charge and multiplicity # # store the original connection matrix for ($j=0 ; $j<$iatm ; $j++ ) { for ($k=0 ; $k<$iatm ; $k++ ) { $i = $j*$iatm+$k ; $conct0[$i] = $conct[$i] ; } } # ****** main loop ******* for ( $loop=0 ; $loop<6 ; $loop++ ) { # print out connection matrix # for ($j=0 ; $j<$iatm ; $j++ ) { # for ($k=0 ; $k<$iatm ; $k++ ) { # $i = $j*$iatm+$k ; print $conct[$i] ; # } # print " $el[$j]($iZ[$j])/$sumZ[$j]/:$ixcnct[$j] \n" ; # } # find number of unique connection numbers $ncnt = 1 ; $iunique[0] = $ixcnct[0] ; for ($j=1 ; $j<$iatm ; $j++ ) { for ($icnt=0 ; $icnt<$ncnt ; $icnt++) { if ( $ixcnct[$j] == $iunique[$icnt] ) { last } } if ($icnt == $ncnt) { $iunique[$ncnt] = $ixcnct[$j] ; $ncnt++ } } #print "number of unique connection numbers= $ncnt-1 \n" ; # for ($icnt=0 ; $icnt<$ncnt ; $icnt++) { # print " $iunique[$icnt] " ; } #print "\n" ; # loop stop criterion if ( $ncnt <= $ncnt1 ) { last } # store the old connection matrix $ncnt1 = $ncnt ; for ($j=0 ; $j<$iatm ; $j++ ) { $ixcnct1[$j] = $ixcnct[$j] ; for ($k=0 ; $k<$iatm ; $k++ ) { $i = $j*$iatm+$k ; $conct1[$i] = $conct[$i] ; } } # multiply the active by the orginal connection matrix for ($j=0 ; $j<$iatm ; $j++ ) { $ixcnct[$j] = 0 ; for ($k=0 ; $k<$iatm ; $k++ ) { $i = $j*$iatm+$k ; $conct[$i] = 0 ; for ($m=0 ; $m<$iatm ; $m++ ) { $jm = $j*$iatm+$m ; $mk = $m*$iatm+$k ; $conct[$i] += $conct1[$jm]*$conct0[$mk] ; } # get the new connection numbers $ixcnct[$j] += $conct[$i] ; } } } ; # **** end main loop **** # # now get canonical ordering # just do a bubble sort based on connection number, atomic number, # and sum of atomic number of attached atoms for ($j=0 ; $j<$iatm ; $j++ ) { $ord[$j]=$j ; $ixcnct[$j]=$ixcnct1[$j] } $move = 1 ; while ( $move ) { $move = 0 ; for ($j=0 ; $j<$iatm-1 ; $j++ ) { if ( $ixcnct[$j]<$ixcnct[$j+1] ) { $move = 1 ; last } if ( $ixcnct[$j] != $ixcnct[$j+1] ) { next } if ( $iZ[$j]>$iZ[$j+1] ) { $move = 1 ; last } if ( $iZ[$j] != $iZ[$j+1] ) { next } if ( $sumZ[$j]<$sumZ[$j+1] ) { $move = 1 ; last } } ; # end bubble sort for if ( $move) { # exchange atoms $temp = $ord[$j] ; $ord[$j]=$ord[$j+1] ; $ord[$j+1]=$temp ; $temp = $ixcnct[$j] ; $ixcnct[$j]=$ixcnct[$j+1] ; $ixcnct[$j+1]=$temp ; $temp = $iZ[$j] ; $iZ[$j]=$iZ[$j+1] ; $iZ[$j+1]=$temp ; $temp = $sumZ[$j] ; $sumZ[$j]=$sumZ[$j+1] ; $sumZ[$j+1]=$temp ; } ; # end if move } ; # end bubble sort while # build canonical connection matrix $conel = '' ; $totZ = 0 ; for ($j=0 ; $j<$iatm ; $j++ ) { $conel .= $el[$ord[$j]] ; $totZ += $iZ[$j] ; for ($k=0 ; $k<$iatm ; $k++ ) { $jm = $ord[$j]*$iatm+$ord[$k] ; $i = $j*$iatm+$k ; $cct[$i] = $conct0[$jm] ; } } # print out canonical connection matrix #print "***canonical ordering*** \n" ; #print "..........01234567890123456 \n" ; # for ($j=0 ; $j<$iatm ; $j++ ) { # print "($j)old $ord[$j]::" ; # for ($k=0 ; $k<$iatm ; $k++ ) { # $i = $j*$iatm+$k ; print $cct[$i] ; # } # print " $el[$ord[$j]]($iZ[$j])/$sumZ[$j]/:$ixcnct[$j] \n" ; # } # construct keys $c = 0 ; if ( $iatm < 9 ) { # get condensed connection matrix $ipow2 = 1 ; for ($j=0 ; $j<$iatm ; $j++ ) { # check first to see if row empty for ($k=$j+1 ; $k<$iatm ; $k++ ) { $i = $j*$iatm+$k ; if ( $cct[$i]==1 ) { last } } ; # end for check row # entry for this row if ( $k == $iatm ) { $ipow2 *= 2 ; } else { if ( $j != 0 ) { $c += $ipow2 ; $ipow2 *= 2 } for ($k=$j+1 ; $k<$iatm ; $k++ ) { $i = $j*$iatm+$k ; if ( $cct[$i] == 1 ) { $c += $ipow2 } $ipow2 *= 2 } ; # end for row } ; # end if else nonempty row } ; # end for each row } ; # end if <9 atoms # # now get charge, multiplicity and header from html file $file =~ s/mol/html/ ; open(HTM,$file) || die "Can't open html file $file" ; $charge = 0 ; $multiplicity = 1 ; $chg = '' ; $mult = '' ; while () { last if /Tell/ ; if (/BODY/) { s/// ; s/

// ; s/<\/H2>// ; chop ; $head = $_ ; } if (/charge/) { s/The ion charge is // ; ($chg,$tail) = split(/\./) ; } $charge = $chg if $chg ; if (/multiplicity/) { s/The multiplicity is // ; ($mult,$tail) = split(/\./) if $charge == 0 ; ($stuff,$mult,$tail) = split(/\./) if $charge != 0 ; } } $multiplicity = $mult if $mult ; print "$iatm $charge $multiplicity $totZ $conel $c $file\n" ; }