#!/usr/bin/perl use strict; use warnings; use LWP::Simple; # "use LWP::Simple" allows use of the "get()" function below to get # the page source of a url. # # Let me know if you have any questions or comments. # # Mark Gurling 2010 # # mgurling@berkeley.edu # ####### VARIABLES ####### print "Enter WormBase Gene Details url\n"; my $url_source = <> ; # This variable contains the url for which I want the page source. my @lines = url_prep($url_source); # This brings the array "@lines" out of the "url_prep" sub so that it can # be used later. print "cdna or genomic?\n" ; my $choice = <> ; my @raw_seq = chooser($choice, @lines); # Gets "@raw_seq" out of "choice". print "Choose colors?\n" ; my $color_choice = <> ; my @colors = choose_colors($color_choice) ; my $file_name = extract_title(@raw_seq); # This stores the gene name from "@raw_seq" in "$file_name". my @raw_color_tags = get_clean_array(@raw_seq); # Gets cleaned up sequence that contains only the # color tags. ####### REASSEMBLING SEQUENCE AND CREATING REFERENCES ####### my %clean_hash = make_clean_hash(@raw_color_tags); # Makes a clean reference for the array so that colors can # be matched to sequence. my @clean_seq = make_clean_seq(@raw_color_tags); # Removes any html from the sequence. my $string_seq = join("",@clean_seq); # Joins the sequence array into one string to make another array. ####### PRINTING TO FILE ####### open (APEFILE, ">$file_name.ape") ; print_header($string_seq) ; close (APEFILE) ; open (APEFILE, ">>$file_name.ape") ; print_until_features() ; print_features(@clean_seq) ; print_bottom_sequence($string_seq) ; close (APEFILE) ; ####### SUBROUTINES ####### sub url_prep { my @url_line = @_; # This puts the url in a new variable that will be used within the sub. my $content = get($url_line[0]) or die 'Unable to get page'; # "$content" now contains the page source from the above url. # If get() didn't work then you get the error message. my @tmp_array = split(/\n/, $content); # This makes each line in "$content" an element in the array. return @tmp_array; # Outputs array so that I can put it in another array outside of # this sub. } sub line_counter { my $counter = 0; # The counter starts at 0 so that if the regex is found in the first # line then this sub will return 0, which is the index in "@lines" for # the element containing the first line of the url. my @array_and_term = @_; my @search_term = $array_and_term[$#array_and_term]; pop(@array_and_term); my @array = @array_and_term; # The input for the sub is passed to this array so that it can be # used later. for my $i (@array) { # This loop will put each line of the url--now found in "@lines" as separate # elements--one at a time into the variable $i and then send it through # the procedure below. unless ($i =~ /$search_term[0]/) { # This looks for the search term that we input # to the sub. $counter++; # if the search term isn't found on the line, the # counter is incremented by one. } else { last; # This ends the loop once the search term is found so # that the counter doesn't keep incrementing as the loop # goes through the remaining line of the page source. } } return $counter; # Outputs the number on the counter. # This is the number of the first line in the page source that # contains the search term. } sub raw_spliced_seq { my @tmp_lines = @_; # Stores the input in a temporary array. my $left_bound = line_counter(@tmp_lines,"spliced"); # Uses the line number where "spliced" was found as the starting line for # the array. my $right_bound = line_counter(@tmp_lines,"unspliced"); # Uses the line number where "unspliced was found as the last line for # the array. $right_bound--; # Subtracts 1 from the right bound so that the title of the # unspliced sequence is not included. my @raw_seq = @tmp_lines[$left_bound..$right_bound]; # Stores the appropriate lines in the array. return @raw_seq; # This sub will return the array which # can be stored in another array for later use. } sub raw_unspliced_seq { # See "raw_spliced_seq" comments for comparison. my @tmp_lines = @_; my $left_bound = line_counter(@tmp_lines,"unspliced"); my $right_bound = line_counter(@tmp_lines,"feedback"); $right_bound--; my @raw_seq = @tmp_lines[$left_bound..$right_bound]; return @raw_seq; } sub extract_title { my @tmp_array = @_; # Puts the raw sequence in a temporary array. my $first_line = $tmp_array[0]; # Gets the first line--the raw title line--from the temporary array. my @clipped_line = split />/, $first_line; # Splits the first line into array elements. # The last element will contain the actual title. return $clipped_line[$#clipped_line]; # returns the title (i.e. the last line of the array). } sub remove_spans { my @tmp_array = @_; my $tmp_string = join("",@tmp_array); # Puts the array pieces together again and stores them in the string. my @raw_spans_removed = split(/<\/?span>?/,$tmp_string); # Splits "$string" up by removing the spans and puts the pieces in the array. return @raw_spans_removed; } sub remove_pre_tag { my @tmp_array = @_; $tmp_array[$#tmp_array] =~ s/<\/pre>//; # Removes the tag from the ends of the sequences. my @raw_color_tags_and_spaces = @tmp_array; return @raw_color_tags_and_spaces; } sub remove_empty_elements { my @tmp_array = @_; my @raw_color_tags = grep /\S/, @tmp_array; # Makes an array--"@raw_color_tags"--by extracting from "@tmp_array" the # elements that containing "non-white-space" (i.e. \S). return @raw_color_tags; } sub make_prehash_array { my @tmp_array = @_; my @export_array = (); for my $var (@tmp_array) { push(@export_array,$var); push(@export_array,"0"); } return @export_array; } sub hash_count { my @tmp_array = @_; my %export_hash = make_prehash_array(@tmp_array); # Makes an array that will make a hash without changing "@raw_color_tags". for my $var (keys %export_hash) { if ($var =~ /FFFF00/) { $export_hash{$var}+=1 # Codes the FFFF00 color as 1. } if ($var =~ /FFA500/) { $export_hash{$var}+=2 # Codes the FFA500 color as 2. } # If no color information is there the value remains 0. } return %export_hash; } sub make_clean_hash { my @tmp_array = @_; my %counted = hash_count(@tmp_array); my %export_hash; # Hash is declared. @export_hash { map { s/.*style="Background\-color: #[FA50]{6}">//; $_ } keys %counted } = values %counted; # ??? I don't quite get it, but it works. It changes the keys of "%counted" and maps them to # a new hash and assigns back the values found in the old hash. return %export_hash; } sub make_clean_seq { my @tmp_array = @_ ; my @export_seq = map { s/.*style="Background\-color: #[FA50]{6}">//; $_ } @tmp_array; # takes each element of the temp array, removes the html tag at the front, then # maps it to the new export array. return @export_seq; } sub get_clean_array { my @tmp_array = @_ ; shift(@tmp_array); # Removes the first element (i.e. the title line). my @raw_spans_removed = remove_spans(@tmp_array); # Removes span tags and save the output in a new array. my @raw_color_tags_and_spaces = remove_pre_tag(@raw_spans_removed); # Removes the tag from the last line of the page source. my @export_array = remove_empty_elements(@raw_color_tags_and_spaces); # Makes a new array containing only the elements of the previous array that weren't empty. return @export_array; } sub print_bottom_sequence { my @tmp_array = @_ ; my $tmp_string = $tmp_array[0]; my @pre_print_seq = $tmp_string =~ /[atgc]{1,10}/ig; # Breaks the string into an array. Each element contains 10 bp. my $ten_string = join(' ',@pre_print_seq); # Rejoins the sequence as 10bp pieces separated by spaces. my @array_for_print = $ten_string =~ /[atgc]{1,10}\s?[atgc]{1,10}\s?[atgc]{1,10}\s?[atgc]{1,10}\s?[atgc]{1,10}\s?[atgc]{1,10}\s?/ig; # Finds a group of 60bp and puts the string in the array as # an element. I'm sure there is a better way to write that # regular expression. my $line_number = 1; print APEFILE "ORIGIN\n"; for my $fumes (@array_for_print) { printf APEFILE "%9d ", $line_number; print APEFILE "$fumes\n"; $line_number+=60; } print APEFILE '//'; } sub header_date { my @time = localtime(time); # Gets weird time array. my @month_brev = qw(JAN FEB MAR APR MAY JUN JUL AUG SEP OCT NOV DEC); my $year = $time[5]; $year+=1900; # Gets year from the weird time array and formats it. my $day = sprintf("%02d", $time[3] ); # Stores the formatted day in a variable. my $export_date = "$day-$month_brev[$time[4]]-$year"; return $export_date; } sub print_header { my @tmp_array = @_ ; my $tmp_string = $tmp_array[0] ; my $seq_length = length($tmp_string); $seq_length = sprintf("%19d", $seq_length); # Formats the DNA size to right justify and allows for # 19 digits. my $header_date = header_date(); print APEFILE "LOCUS$seq_length bp ds-DNA linear $header_date\n"; } sub print_until_features { print APEFILE "DEFINITION\nACCESSION\nVERSION\nSOURCE\n ORGANISM\nCOMMENT\n"; print APEFILE "COMMENT ApEinfo:methylated:1\n"; print APEFILE "FEATURES Location/Qualifiers\n"; } sub get_positions { my @tmp_array = @_ ; if ( $tmp_array[0] =~ /$tmp_array[1]/ ) { my $start = $-[0] ; my $end = $+[0] ; $start++; my @export_array = ($start, $end); return @export_array; } } sub print_features { my @tmp_array = @_ ; my $name = "exon" ; my $counter = 1 ; for my $ft_cycler ( @tmp_array ) { unless ( $clean_hash{$ft_cycler} == 0 ) { my @poses = get_positions($string_seq, $ft_cycler); print APEFILE " misc_feature "; print APEFILE "$poses[0]..$poses[1]\n"; print APEFILE " " ; print APEFILE "/ApEinfo_label=$name $counter\n" ; $counter++; if ($clean_hash{$ft_cycler} == 1 ) { print APEFILE " " ; print APEFILE "/ApEinfo_fwdcolor=$colors[0]\n" ; print APEFILE " " ; print APEFILE "/ApEinfo_revcolor=$colors[2]\n" ; } if ($clean_hash{$ft_cycler} == 2 ) { print APEFILE " " ; print APEFILE "/ApEinfo_fwdcolor=$colors[1]\n" ; print APEFILE " " ; print APEFILE "/ApEinfo_revcolor=$colors[3]\n" ; } } } } sub chooser { my @tmp_array = @_ ; my @export_array ; my $choice = $tmp_array[0] ; shift(@tmp_array) ; chomp($choice); if ( $choice eq "cdna" ) { @export_array = raw_spliced_seq(@tmp_array); # This cuts the desired sequence out of "@lines" and puts # it into @raw_seq. "raw_spliced_seq()" or "raw_unspliced_seq" is used here. } elsif ( $choice eq "genomic" ) { @export_array = raw_unspliced_seq(@tmp_array); # This cuts the desired sequence out of "@lines" and puts # it into @raw_seq. "raw_spliced_seq()" or "raw_unspliced_seq" is used here. } else { print "Not a valid choice."; die (""); } return @export_array; } sub choose_colors { my @tmp_array = @_ ; my $tmp_string = $tmp_array[0] ; my $first_fcolor ; my $second_fcolor ; my $first_rcolor ; my $second_rcolor ; if ( $tmp_string =~ /n/i ) { $first_fcolor = "#ffff00" ; $second_fcolor = "#ffa500" ; $first_rcolor = "green" ; $second_rcolor = "blue" ; } elsif ( $tmp_string =~ /y/i ) { print "Enter color names or hex code (e.g. #ff0080)\n\n" ; print "Forward color 1\n" ; $first_fcolor = <> ; chomp($first_fcolor) ; print "Forward color 2\n" ; $second_fcolor = <> ; chomp($second_fcolor) ; print "Reverse color 1\n" ; $first_rcolor = <> ; chomp($first_rcolor) ; print "Reverse color 2\n" ; $second_rcolor = <> ; chomp($second_rcolor) ; } return ($first_fcolor, $second_fcolor, $first_rcolor, $second_rcolor) ; } sub print_array { for my $i (@_) { print "$i \n"; } }