#!/pkg/bin/perl -w

# $Id: parse.pl,v 1.17 1997/02/01 20:13:48 hoagland Exp $


# This file contains general operations to help in parsing of text.

sub split_outside_quotes {
  my($splitchar,$text,$qchar)= @_;
  $qchar= '"' unless defined($qchar); # double quote is default
  my(@sparts)= split($splitchar,$text);
  my(@parts)= ();
  my $in_quote=0;
  while (defined($_=shift(@sparts))) {
    # not in a quote as of last piece
    $in_quote= !$in_quote while (m/\Q$qchar\E/g);
    if ($in_quote) { # an odd # of quotes
      # cat together till find another piece with an odd # of quotes
      my $part=$_;
      while (defined($_=shift(@sparts))) {
	# in a quote as of last piece
	$in_quote= !$in_quote while (m/\Q$qchar\E/g);
	$part.= $splitchar.$_;
	last if !$in_quote;
      }
      push(@parts,$part);
    } else {
      push(@parts,$_);
    }
  }
  return @parts;
}


########

# The following functions encode quoted parts of some text as an aid
# to parsing (i.e. to hide certain quoted characters from a "split").
# This is done by setting the first bit of characters inside the
# quote, including the quote itself.  It is important to note that
# these functions will _not_ quote things that have already been
# quoted.  Thus if you have the text a"b\"c"d, encoding it first with
# a backslash will yield a"b**c"d (where *'s represent shifted chars).
# Trying to encode a"b**c"d by a pair of double quotes will have no
# affect.  On the other hand if you encoded the double quotes first
# you will get a***c"d.  In other words, these functions are only
# useful in certain quoting/encoding situations.

# encode_quote_single encodes the given text as above by the
# characters given.  The characters are processed in the order given
# and are interpreted as characters which quote just the next
# character.
#
# encode_quote_single: text {quote_chars} -> text
#
# e.g. $text= &encode_quote_single($text,'\\');
#
sub encode_quote_single {
  $_= shift;
  my($upqchar,$qchar);
  foreach $qchar (@_) {
    $upqchar= chr(ord($qchar)+128);
    s/\Q$qchar\E([^\x80-\xff])/$upqchar.chr(ord($1)+128)/eg;
  }
  return $_;
}

# encode_quote_range encodes the given text as above by the characters
# given.  The characters are processed in the order given and are
# interpreted as characters which quote all characters between an
# adjacent pair of the quoting characters.
#
# encode_quote_range: text {quote_chars} -> text
#
# e.g. $text= &encode_quote_range($text,'"');
#
sub encode_quote_range {
  $_= shift;
  my($upqchar,$qchar);
  foreach $qchar (@_) {
    $upqchar= chr(ord($qchar)+128);
    s/\Q$qchar\E([^\x80-\xff]*?)\Q$qchar\E/$upqchar.join('',map(chr(ord($_)+128),split('',$1))).$upqchar/eg;
#    s/\Q$qchar\E([^\x80-\xff]*)\Q$qchar\E/$upqchar.($t= $1, $t=~ tr\/\\x80-\\xff\/\\x00-\\x7f\/,$t).$upqchar/eg;
  }
  return $_;
}

# rest_quotes unencodes all the encoding done by the
# encode_quote_{single,range}, in essence, restoring the quoted text.
#
# rest_quotes: text -> text
#
# e.g. $text= &rest_quotes($text);
#
sub rest_quotes {
  $_= shift;
  tr/\x80-\xff/\x00-\x7f/;
  return $_;
}

# rest_without_quote_single restores the quoting done by the the given
# characters, but removes the quote character in the process.  The
# characters are processed in the order given and are interpreted as
# characters which quote just the next character.  For example, if the
# text before being processed to encode_quote_single with \ is a\bc,
# the text after a call to rest_without_quote_single with \ will be
# abc.  Note: if the calls to the rest_without_quote* routines do not
# contain the characters in the reverse order encoded, you may not get
# the result you expect.
#
# rest_without_quote_single: text {quote_chars} -> text
#
# e.g. $text= &rest_without_quote_single($text,'\\');
#
sub rest_without_quote_single {
  $_= ' '.shift;
  my($upqchar,$qchar);
  foreach $qchar (@_) {
    $upqchar= chr(ord($qchar)+128);
    s/\Q$upqchar\E([\x80-\xff])/chr(ord($1)-128)/eg;
  }
  s/^ //;
  return $_;
}

# rest_without_quote_range restores the quoting done by the the given
# characters, but removes the quote characters in the process.  The
# characters are processed in the order given and are interpreted as
# characters which quote all the characters between adjacent pairs of
# the quoting characters.  For example, if the text before being
# processed to encode_quote_range with " is a"b"c, the text after a
# call to rest_without_quote_range with will be abc.  Note: if the
# calls to the rest_without_quote* routines do not contain the
# characters in the reverse order encoded, you may not get the result
# you expect.
#
# rest_without_quote_range: text {quote_chars} -> text
#
# e.g. $text= &rest_without_quote_range($text,'"');
#
sub rest_without_quote_range {
  $_= shift;
  my($upqchar,$qchar);
  foreach $qchar (@_) {
    $upqchar= chr(ord($qchar)+128);
    s/\Q$upqchar\E([\x80-\xff]*)\Q$upqchar\E/join('',map(chr(ord($_)-128),split('',$1)))/eg;
  }
  return $_;
}


#################

# encode_quotes takes backquotes and double quotes and reduces them to
# some non-printable characters surrounding a number, which is an
# index into a mapping.  The resulting text and the mapping are
# returned.  This is done so that characters such as "]" or ";" inside
# of quotes don't interfere in the parsing of those characters in their
# special roles. If a second argument is given to the function, then
# that is used as the quote character instead of double quotes.  This
# fact is stored in the mapping.  This function may be called
# successively over the same original string, but it is important to
# apply the resulting maps in exactly the reverse order.
#
# encode_quotes: text [quote_char] -> encoded-text x map-ref
# 
# e.g. ($enc_text,$map)= &encode_quotes($text);
#
sub encode_quotes {
  # based on a portion of pipeline.pl, by Jim Hoagland
  $_= shift;
  my($qchar)= shift;
  $qchar= '"' unless defined($qchar); # \" is the default
  my($echar1,$echar2); # characters to encode with

  my $i=255;   # look for two safe character
  while (1) {
    $echar1= chr($i);
    $i--;
    last unless /$echar1/;
  }
  while (1) {
    $echar2= chr($i);
    last unless /$echar2/;
    $i--;
  }
    
  my(@qte)= ();
  my $count=0;

  #encode backslashes
  while (s/\\(.)/$echar1$count$echar1/) {
    $qte[$count++]= $1;
  }
  
  # encode quotes
  while (s/$qchar([^$qchar]*)$qchar/$echar2$count$echar2/) {
    $qte[$count++]= $1;
  }
  return ($_,[$qchar,$echar1,$echar2,@qte]);
}

# restore_quotes restore the backquotes and double quotes (or whatever
# the previous quoting char was) that were encoded by encode_quotes().
# Includes the quotes.
#
# restore_quotes: encoded-text x map-ref -> text
# 
# e.g. $text= &restore_quotes($enc_text,$map);
#
sub restore_quotes {
  # based on a portion of pipeline.pl, by Jim Hoagland
  $_= shift;
  my(@qte)= @{$_[0]};
  my($qchar,$echar1,$echar2,@qte)= @qte;

  s/$echar2(\d+)$echar2/$qchar$qte[$1]$qchar/g; #restore items in quotes
  s/$echar1(\d+)$echar1/\\$qte[$1]/g; #restore backslashed items
  return $_;
}

# restore_without_quotes restore the backquoted and double quoted (or
# whatever the previous quoting char was) items that were encoded by
# encode_quotes(), but without the "" or \
#
# restore_without_quotes: encoded-text x map-ref -> text
# 
# e.g. $text= &restore_quotes($enc_text,$map);
#
sub restore_without_quotes {
  # based on a portion of pipeline.pl, by Jim Hoagland
  $_= shift;
  my(@qte)= @{$_[0]};
  my($qchar,$echar1,$echar2,@qte)= @qte;

  s/$echar2(\d+)$echar2/$qte[$1]/g; #restore items in quotes
  s/$echar1(\d+)$echar1/$qte[$1]/g; #restore backslashed items
  return $_;
}


###############

# hide_words takes some given words and reduces them to some
# non-printable characters surrounding a number, which is an index
# into a mapping.  The resulting text and the mapping are returned.
# The words must start and end at a word boundary to be hidden.
#
# hide_words: text {word} -> encoded-text x map-ref
# 
# e.g. ($enc_text,$map)= &hide_words($text,'eq','lt','gt');
#
sub hide_words {
  # based on a portion of pipeline.pl, by Jim Hoagland
  $_= shift;
  my(@words)= @_;

  my(@qte)= ();
  my $count=0;

  # encode words
  foreach $word (@words) {
    while (s/\b($word)\b/\xf2$count\xf2/) {
      $qte[$count++]= $1;
    }
  }
  return ($_,[@qte]);
}

# restore_words restore the words that were encoded by hide_words()
# given the text and map.
#
# restore_words: encoded-text x map-ref -> text
# 
# e.g. $text= &restore_words($enc_text,$map);
#
sub restore_words {
  # based on a portion of pipeline.pl, by Jim Hoagland
  $_= shift;
  my(@qte)= @{$_[0]};

  s/\xf2(\d+)\xf2/$qte[$1]/g; #restore words
  return $_;
}

##########

# double_quote puts "'s around the argument list, converting existing
# \'s to \\'s and "'s to \"'s, returning the result
#
# double_quote: text -> text
#
# i.e. $qtext= &double_quote($text);
#
sub double_quote {
  $_= shift;
  return '""' unless defined($_);
  s/\\/\\\\/g;
  s/"/\\"/g;
  return "\"$_\"";
}

# not extensively tested
# # backquote_chars places a backquote in front of the indicated
# # characters in the given string and returns the result
# #
# # backquote_chars: text x chars -> text
# #
# # i.e. $qtext= &backquote_chars($text,'";,{}[]');
# #
# sub backquote_chars {
#   $_= shift;
#   my($chars)= quotemeta(shift);
#   my($bq_in_chars)= ($chars =~ s/\\\\//);
#   s/\\/\xFF/g;  # hide existing backquotes
#   s/^([$chars])/\\$1/;
#   s/([^\\])([$chars])/$1\\$2/g;
#   if ($bq_in_chars) {
#     s/\xFF/\\\\/g;
#   } else {
#     s/\xFF/\\/g;
#   }
#   return $_;
# }

#######

# encode_newline converts a newline in the text into a \n in the
# returned value
#
# encode_newline: text -> text
#
# e.g. $text= &encode_newline($text);
#
sub encode_newline {
  $_= shift;
  s/\n/\\n/g;
  return $_;
}

######

# top_level_pieces returns the pieces of the passed in string that are
# separated by a comma that are not inside (possibly nested) '()',
# '{}', or '[]'.  If a second argument is given, it is used instead of a comma as the separator.  This argument is a perl regular expression.
#
# top_level_pieces: text {x regex_sep} -> { text }
#
# e.g. @pieces= &top_level_pieces($text)
# e.g. @pieces= &top_level_pieces($text,';')
# e.g. @pieces= &top_level_pieces($text,'(;|,)')
#
sub top_level_pieces {
  my($text)= shift;
  my($sep)= shift; # regular expression that is the seperator
  $sep= ',' unless defined($sep);
  my(@chunks)= ();
  my($depth)= 0;
  my($piece)= '';
  my($open,$close)= ( quotemeta('([{') , quotemeta('])}') );
  while (1) {
    if ($depth <= 0 && $text =~ s/^([^$open$close]*?)($sep)//) { # next is a seperator
      push(@chunks,$piece.$1);
      $piece= '';
      $depth=0;
    } elsif ($text =~ s/^([^$close]*?[$open])//) { # next is '(' or '[' or '{'
      $piece.= $1;
      $depth++;
    } elsif ($text =~ s/^([^$open]*?[$close])//) { # next is ')' or '}' or ']'
      $piece.= $1;
      $depth--;
    } else { # end of string; hope depth is 0
      return (@chunks,$piece.$text);
    }
  }
}



1;
