# Default goal: redo the predictions affected by recent PDB additions, then
# remind the operator of the follow-up step.
default: redo-recent
	echo "Do 'make remake-distances' when the jobs are all done."

# These names are actions, not files.  Declaring them phony keeps a stray
# file of the same name from silently disabling the target.
.PHONY: default remake-all setup all-directories redo-recent update-pdbfinder

SCOP = /projects/compbio/data/scop/dir.cla.scop.txt.gz

# Set the "REDOSCOP" macro to indicate whether to pretend that the SCOP
# file is new when doing a remake-all.
# REDOSCOP =
REDOSCOP = -W ${SCOP}

PCB = /projects/compbio/bin
PCBS = ${PCB}/scripts

# Machine type used to select the directory of installed executables.
# Simply-expanded (:=) so that `uname -m` is run once, not on every reference.
ifndef UNAME-M
UNAME-M := $(shell uname -m)
endif
ifeq (${UNAME-M}, sun4)
  UNAME-M = sun4m
endif

# The following is really a bit of a bug, since we'd *like* to use x86_64
# executables if they exist, but we have almost none installed.
ifeq (${UNAME-M},x86_64)
  UNAME-M = i686
endif

# A UNAME-M of "i386-linux" (which can only have been supplied from outside
# this file) is replaced by what `uname -m` reports.
ifeq ($(UNAME-M), i386-linux)
UNAME-M := $(shell uname -m)
endif

PCB-MACH = ${PCB}/${UNAME-M}

PCEM = /projects/compbio/experiments/models.97
PCEM-SCRIPTS = ${PCEM}/scripts

YEAST:=/projects/compbio/experiments/protein-predict/yeast

ifndef CLUSTER
CLUSTER:= moai
#CLUSTER:= farm
endif

# Drivers that read a list of ids on stdin and run the per-target Makefile
# for each id: PARA on the cluster, SERIAL locally.
# NOTE(review): that description is inferred from the option names; the
# scripts themselves are not visible from here.
PARA= ${PCEM-SCRIPTS}/para-trickle-make -cluster ${CLUSTER} -modelsdir ${YEAST} \
    -makefile Makefile -subdir_length 4 -manyids -se2log

SERIAL= ${PCEM-SCRIPTS}/serial-make -modelsdir ${YEAST} \
    -makefile Makefile -subdir_length 4

# Log-producing pattern rules: foo.ids -> foo.<kind>-log.
# stdout and stderr are both captured with the portable "> file 2>&1" form;
# the bash-only "&>" misbehaves when /bin/sh is not bash (the command is put
# in the background and the log is merely truncated).
%.remake-log: %.ids
	${PARA} -targets 'REDO_SEARCHES=1' < $^ > $@ 2>&1

%.redo-all-log: %.ids
	${PARA} -targets 'REDO_ALL=1' < $^ > $@ 2>&1

# NOTE: a second %.make-log rule with the same prerequisite pattern appears
# later in this file; GNU make uses the later definition.
%.make-log: %.ids
	${PARA} < $^ > $@ 2>&1

%.serial-log: %.ids
	${SERIAL} default < $^ > $@ 2>&1

# Redo every ORF: first the ids NOT listed in used-adpstyle1.ids with the
# default settings, then the listed ones with ALIGN_VITERBI=1 ADPSTYLE=1.
remake-all: id-subsets/used-adpstyle1.ids orf_trans.ids
	remove-ids $< < orf_trans.ids \
	    | ${PARA} -targets '${REDOSCOP}'
	${PARA} -targets '${REDOSCOP} ALIGN_VITERBI=1 ADPSTYLE=1' < $<

setup: orf_trans.ids orf_trans.pin

orf_trans.ids: orf_trans
	ids-from-fasta < $^ > $@

id-subsets/kumar.ids: kumar.seqs
	ids-from-fasta < $^ > $@

prefix.counts: orf_trans.ids
	scripts/count-prefixes < $^ > $@

all-directories: orf_trans.ids
	scripts/distribute-to-dirs < $^

# setup indexes for NCBI blast of yeast orfs
# (one formatdb run produces all five index files for the stem)
%.phr %.pin %.psd %.psi %.psq: %
	${PCB-MACH}/formatdb -i $* -l formatdb.log -p T -o T

PDBFINDER-DIR = /projects/compbio/data/pdbfinder
PDBFINDER = ${PDBFINDER-DIR}/pdbfinder.dat.gz
# PDB-SEQS= /projects/compbio/data/pdb/all-protein
PDB-SEQS= /projects/compbio/data/pdb/dunbrack-pdbaa

# Update the definition of RECENT before making redo-recent.
# This is still important, unless we filter out the targets that were
# recently redone, because a given template may not score well enough
# to be retained in the best-scores.rdb file, and would keep re-triggering
# predictions, even though not much has changed.
include define-recent.make

redo-recent: update-pdbfinder \
	recent-pdb.ids recent-templates.ids \
	redo-recent.ids \
	redo-adpstyle1.ids redo-adpstyle5.ids \
	redo-adpstyle1.log redo-adpstyle5.log

# Bring the pdbfinder data up to date.  "&&" ensures make is never started
# in the current directory if the cd fails (which would recurse into this
# very Makefile); $(MAKE) keeps -n and the jobserver working.
update-pdbfinder:
	cd ${PDBFINDER-DIR} && $(MAKE) -k

# set USE_ALL_PDB_MINUS_OLD
# to find new sequences by taking the current set and removing ones
# that pdbfinder reports as old.
# USE_ALL_PDB_MINUS_OLD = 1

# Find out what sequences have been added to PDB recently
# BUG: PDBFINDER reports Deposition date, not Release date!
ifdef USE_ALL_PDB_MINUS_OLD
# figure out the new pdb sequences by subtracting the old ones
# from the complete set.
old-pdb.ids: ${PDBFINDER}
	gunzip -c ${PDBFINDER} \
	| ${PDBFINDER-DIR}/find-recent-ids \
		-before ${RECENT} -after 1950-01-01 -minaa 0 \
	> $@

recent-pdb.ids: ${PDB-SEQS} old-pdb.ids
	ids-from-fasta < ${PDB-SEQS} \
	| remove-ids old-pdb.ids \
	> $@
else
# trust PDBFINDER to be up-to-date enough, and just extract new ids
# from it
recent-pdb.ids: ${PDBFINDER} define-recent.make
	date
	gunzip -c ${PDBFINDER} \
	| ${PDBFINDER-DIR}/find-recent-ids \
		-after ${RECENT} -minaa 15 \
	> $@
endif

recent-templates.ids: define-recent.make ${PCEM}/indexes/t2k.ids
	date
	scripts/find-recent-templates -after ${RECENT} \
		< ${PCEM}/indexes/t2k.ids >$@

recent.ids: recent-pdb.ids recent-templates.ids
	date
	cat $^ > $@

# Get the sequences in FASTA format and remove duplications.
# The preferred way to build recent.seq would be
#	${PCB-MACH}/fastacmd -d ${PDB-SEQS} -i $^ > tmp-pdb.seq
#	${PCB-MACH}/uniqueseq recent -db tmp-pdb.seq
#	rm tmp-pdb.seq
# but fastacmd does stupid name-mangling (adding "lcl|" to
# the beginning of each id), so the ids are pulled straight out of the
# FASTA file instead.
#
recent.seq: recent.ids
	date
	$(PCBS)/extract-from-fasta $< < $(PDB-SEQS) > $@

# E-value cutoff used when deciding whether a sequence should be redone.
# Only set here when the caller has not already supplied a value.
ifndef BLAST_E_VALUE
BLAST_E_VALUE := 0.01
endif

# Find any Yeast ORFS that are similar to the new PDB sequences using blastp
# This is the old version, when -m 9 was not being used for blastall.
#recent.blast: orf_trans.pin recent.seq
#	${PCB-MACH}/blastall -p blastp -d orf_trans -i recent.seq \
#		-e ${BLAST_E_VALUE} -o $@

# Current version.  The database name is the first prerequisite minus its
# ".pin" suffix; the query file is the second prerequisite.
recent.blast: orf_trans.pin recent.seq
	$(PCB-MACH)/blastall -p blastp -d $(basename $<) -i $(word 2,$^) \
		-e $(BLAST_E_VALUE) -o $@ -m 9 -F F

# This is a blast of all pdb sequences against all orf sequences.  It is used
# in worse-than-expected-hmms
t2k-x-seqs.blast: orf_trans.pin ${PCEM}/indexes/t2k.x-seqs
	$(PCB-MACH)/blastall -p blastp -d $(basename $<) -i $(word 2,$^) \
		-e $(BLAST_E_VALUE) -o $@ -m 9 -F F

# Which yeast ORFs should be redone based on the recent additions to PDB.
# Previously we were putting all of the recent IDs into redo-recent.ids
#
# NOTE(review): the recipe names neither output file nor recent.blast, so
# get-ids-to-redo presumably uses these names by default -- confirm against
# the script.  Also note that a rule with two explicit targets is really two
# independent rules sharing one recipe, so under "make -j" the script may be
# run twice.
redo-recent.ids targets-skipped.ids: orf_trans.ids recent.blast
	scripts/get-ids-to-redo -mult 1.e-07 -exp 0.4 -suppress < $<

# A way to see if some specific targets can be updated between monthly updates
# This would show those IDs that might be worthwhile to update individually
worse-than-expected-hmms.ids worse-hmms-skippped.ids: orf_trans.ids t2k-x-seqs.blast
	scripts/get-ids-to-redo -blast t2k-x-seqs.blast -exp 0.25 -mult 10 \
		-redo worse-than-expected-hmms.ids -skip worse-hmms-skippped.ids \
		-nosuppress \
		< $<

# This target will plot all points that will and will not get updated along with
# the threshold from get-ids-to-redo, in order to debug visually.
# plot-recent names an action, not a file, so it is declared phony.
.PHONY: plot-recent
plot-recent: Lowest_BLAST_HMM_template_target.ps

Lowest_BLAST_HMM_template_target.ps: all-lowest.ids
	gnuplot scripts/plotLowestTargetTemplatePairs.gnuplot

# NOTE(review): all-lowest is run with no arguments and must be found on
# PATH; it presumably reads the two prerequisite files by name -- confirm.
all-lowest.ids: redo-recent.ids targets-skipped.ids
	all-lowest > $@

# Which of the yeast ORFs to be redone need to use adpstyle 1 to avoid
# hmmscore crashes?  This may not matter if you are just redoing the fold-recognition
# search, but does matter if the t2k multiple alignment is to be redone.
redo-adpstyle1.ids: redo-recent.ids id-subsets/used-adpstyle1.ids
	${PCEM-SCRIPTS}/keep-ids id-subsets/used-adpstyle1.ids < redo-recent.ids > $@

# Which yeast ORFs can be redone safely with the default parameters.
# The ids to redo that are NOT listed in id-subsets/used-adpstyle1.ids.
redo-adpstyle5.ids: redo-recent.ids id-subsets/used-adpstyle1.ids
	${PCEM-SCRIPTS}/remove-ids id-subsets/used-adpstyle1.ids < redo-recent.ids > $@

# Launch the redo jobs, capturing stdout and stderr in the log.  The
# portable "> file 2>&1" form is used because the bash-only "&>" misbehaves
# when /bin/sh is not bash (the command is backgrounded and the log is only
# truncated).
# NOTE(review): the %.remake-log rule in this file passes REDO_SEARCHES
# (underscore) while these two rules pass REDO-SEARCHES (hyphen).  Confirm
# which spelling the per-target Makefile actually tests.
redo-adpstyle1.log: redo-adpstyle1.ids
	${PARA} -targets 'REDO-SEARCHES=1 BLAST_MAX=5000 ADPSTYLE=1' < $^ > $@ 2>&1

redo-adpstyle5.log: redo-adpstyle5.ids
	${PARA} -targets 'REDO-SEARCHES=1' < $^ > $@ 2>&1

# Problems with current method for figuring out which ORFs need to have
# the fold-recognition search repeated:
#	The list of recent ids is based on PDB release dates, but it
#	takes up to a week for the releases to be represented in the PDBAA
#	list and our template library, so redoing the search may not
#	get the recent hits!
#
#	There is no check made to see if the existing search for
#	an ORF already found the "new" PDB file.  This would require
#	extracting the pairs of ids from recent.blast, then checking the
#	best-hits list for the yeast id---not a big problem, but a
#	little more complicated than the current blast2-ids script.

# Rebuild the distance tables, treating orf_trans.ids as if it had just
# changed (-W).  $(MAKE) rather than a literal "make" so that -n and the
# jobserver are honoured by the sub-make.
.PHONY: remake-distances
remake-distances:
	$(MAKE) -k -W orf_trans.ids \
		protein-protein.rdb.gz protein-protein-sorted.rdb.gz \
		diagonal.rdb kumar-diagonal.rdb

%.rdb.gz: %.rdb
	gzip -9f $^

protein-protein.rdb.gz: orf_trans.ids
	scripts/make-protein-protein-table < $^ > protein-protein.rdb
	gzip -9f protein-protein.rdb

protein-protein-sorted.rdb: protein-protein.rdb.gz
	gunzip -c $^ \
	| ${PCB}/sorttbl E_value > $@

# Self-hits only (column 1 == column 2), as "prot <TAB> E_value".
# The header is written with printf and an explicit \t so that it is
# tab-separated like the rows awk emits below.
diagonal.rdb: protein-protein-sorted.rdb.gz
	printf 'prot\tE_value\n' > $@
	gunzip -c $^ \
	| awk '$$1 == $$2 {print $$1 "\t" $$3}' >> $@

# The subset of diagonal.rdb whose ids are in id-subsets/kumar.ids, with a
# tab-separated header line and column-format line written first.
kumar-diagonal.rdb: diagonal.rdb
	printf 'prot\tE_value\n' > $@
	printf '10S\t10N\n' >> $@
	${PCEM-SCRIPTS}/keep-ids id-subsets/kumar.ids < $^ >> $@

old-vs-new-diagonal: old-diagonal.rdb new-diagonal.rdb
	scripts/old-vs-new.pl \
		-old old-diagonal.rdb \
		-new new-diagonal.rdb \
	> $@

# BUGGY WHEN LIST OF SEQUENCES CHANGES!
# (Disabled recipe.  NOTE(review): the sed expression below is truncated in
# this copy of the file and cannot be reconstructed from here.)
#	sort old-diagonal.rdb > tmp1
#	sort new-diagonal.rdb > tmp2
#	diff tmp1 tmp2 \
#	| grep '^[<>]' \
#	| sort --key=2,2 --key=1,1 \
#	| sed '/ $@
#	rm tmp1 tmp2

# Keep the rows where column 3 exceeds 1.e10 times column 6, or where
# column 3 contains "--" and column 6 is below 1.e-05; sort them in
# descending general-numeric order on the key starting at field 3.
# "-k3" replaces the obsolete "+2" key syntax, which current sort
# implementations take to be a file name.
# NOTE(review): no rule for old-vs-new-diagonal-% is visible in this file
# (only the unsuffixed old-vs-new-diagonal), so the prerequisite must
# already exist on disk.
big-changes-%: old-vs-new-diagonal-%
	awk '($$3 > 1.e10 * $$6) || ($$3 ~/--/ && $$6 < 1.e-05) {print}' < $< \
	| sort -gr -k3 \
	> $@

orf_trans-length.rdb: orf_trans
	${PCEM-SCRIPTS}/make-length-rdb < $^ > $@

############################################
# Targets for remaking part of the library
############################################

# for-fanhsu names an action, not a file, so it is declared phony.
.PHONY: for-fanhsu
for-fanhsu:
	${PARA} -targets 'w0.5_logo dssp-ehl2_logo jpeg' < orf_trans.ids

# NOTE: %.make-log is also defined earlier in this file with the same
# prerequisite pattern; GNU make uses this later definition, which sends
# only stdout to the log.
%.make-log: %.ids
	${PARA} < $^ > $@

%.jpeg-log: %.ids
	${PARA} -targets 'jpeg' < $^ > $@