;;;; regexp.test ---  test Guile's regexps   -*- coding: utf-8; mode: scheme -*-
;;;; Jim Blandy <jimb@red-bean.com> --- September 1999
;;;;
;;;; 	Copyright (C) 1999, 2004, 2006, 2007, 2008, 2009, 2010,
;;;;      2012, 2013, 2014 Free Software Foundation, Inc.
;;;;
;;;; This library is free software; you can redistribute it and/or
;;;; modify it under the terms of the GNU Lesser General Public
;;;; License as published by the Free Software Foundation; either
;;;; version 3 of the License, or (at your option) any later version.
;;;; 
;;;; This library is distributed in the hope that it will be useful,
;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;;;; Lesser General Public License for more details.
;;;; 
;;;; You should have received a copy of the GNU Lesser General Public
;;;; License along with this library; if not, write to the Free Software
;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

(define-module (test-suite test-regexp)
  #:use-module (test-suite lib)
  #:use-module (srfi srfi-1)
  #:use-module (ice-9 regex))

(when (defined? 'setlocale)
  (setlocale LC_ALL "C"))

;; Don't fail if we can't display a test name to stdout/stderr.
(set-port-conversion-strategy! (current-output-port) 'escape)
(set-port-conversion-strategy! (current-error-port) 'escape)


;;; Run a regexp-substitute or regexp-substitute/global test, once
;;; providing a real port and once providing #f, requesting direct
;;; string output.
(define (vary-port func expected . args)
  (pass-if "port is string port"
	   (equal? expected
		   (call-with-output-string
		    (lambda (port)
		      (apply func port args)))))
  (pass-if "port is #f"
	   (equal? expected
		   (apply func #f args))))

(define (object->string obj)
  (call-with-output-string
   (lambda (port)
     (write obj port))))

;;;
;;; make-regexp
;;;

(with-test-prefix "make-regexp"

  (pass-if-exception "no args" exception:wrong-num-args
    (make-regexp))

  (pass-if-exception "bad pat arg" exception:wrong-type-arg
    (make-regexp 'blah))

  ;; in guile prior to 1.6.5 make-regex didn't validate its flags args
  (pass-if-exception "bad arg 2" exception:wrong-type-arg
    (make-regexp "xyz" 'abc))

  (pass-if-exception "bad arg 3" exception:wrong-type-arg
    (make-regexp "xyz" regexp/icase 'abc)))

;;;
;;; match:string
;;;

(with-test-prefix "match:string"

  (pass-if "foo"
    (string=? "foo" (match:string (string-match ".*" "foo"))))

  (pass-if "foo offset 1"
    (string=? "foo" (match:string (string-match ".*" "foo" 1)))))

;;;
;;; regexp-exec
;;;

(with-test-prefix "regexp-exec"

  (pass-if-exception "non-integer offset" exception:wrong-type-arg
    (let ((re (make-regexp "ab+")))
      (regexp-exec re "aaaabbbb" 1.5 'bogus-flags-arg)))

  (pass-if-exception "non-string input" exception:wrong-type-arg
    (let ((re (make-regexp "ab+")))
      (regexp-exec re 'not-a-string)))

  (pass-if-exception "non-string input, with offset" exception:wrong-type-arg
    (let ((re (make-regexp "ab+")))
      (regexp-exec re 'not-a-string 5)))

  ;; in guile 1.8.1 and earlier, a #\nul character in the input string was
  ;; only detected in a critical section, and the resulting error throw
  ;; abort()ed the program
  (pass-if-exception "nul in input" exception:string-contains-nul
    (let ((re (make-regexp "ab+")))
      (regexp-exec re (string #\a #\b (integer->char 0)))))

  ;; in guile 1.8.1 and earlier, a bogus flags argument was only detected
  ;; inside a critical section, and the resulting error throw abort()ed the
  ;; program
  (pass-if-exception "non-integer flags" exception:wrong-type-arg
    (let ((re (make-regexp "ab+")))
      (regexp-exec re "aaaabbbb" 0 'bogus-flags-arg))))      

;;;
;;; fold-matches
;;;

(with-test-prefix "fold-matches"

  (pass-if "without flags"
    (equal? '("hello")
            (fold-matches "^[a-z]+$" "hello" '()
                          (lambda (match result)
                            (cons (match:substring match)
                                  result)))))

  (pass-if "with flags"
    ;; Prior to 1.8.6, passing an additional flag would not work.
    (null?
     (fold-matches "^[a-z]+$" "hello" '()
                   (lambda (match result)
                     (cons (match:substring match)
                           result))
                   (logior regexp/notbol regexp/noteol))))

  (pass-if "regexp/notbol is set correctly"
    (equal? '("foo")
            (fold-matches "^foo" "foofoofoofoo" '()
                          (lambda (match result)
                            (cons (match:substring match)
                                  result))))))


;;;
;;; regexp-quote
;;;

(define-syntax with-ascii-or-latin1-locale
  (syntax-rules ()
    ((_ chr body ...)
     (if (> chr 127)
         (with-latin1-locale body ...)
         (begin body ...)))))

;; Since `regexp-quote' uses string ports, and since it is used below
;; with non-ASCII characters, these ports must be Unicode-capable.
(define-syntax with-unicode
  (syntax-rules ()
    ((_ exp)
     (with-fluids ((%default-port-encoding "UTF-8"))
       exp))))

(with-test-prefix "regexp-quote"

  (pass-if-exception "no args" exception:wrong-num-args
    (regexp-quote))

  (pass-if-exception "bad string arg" exception:wrong-type-arg
    (regexp-quote 'blah))

  (let ((lst `((regexp/basic    ,regexp/basic)
	       (regexp/extended ,regexp/extended)))
	;; String of all latin-1 characters, except #\nul which doesn't
	;; work because it's the usual end-of-string for the underlying
	;; C regexec().
	(allchars (list->string (map integer->char (cdr (iota 256))))))
    (for-each
     (lambda (elem)
       (let ((name (car  elem))
	     (flag (cadr elem)))

	 (with-test-prefix name

	   ;; Try on each individual latin-1 character, except #\nul.
	   (do ((i 1 (1+ i)))
	       ((>= i 256))
             (let* ((c (integer->char i))
                    (s (string c)))
               (pass-if (list "char" i (format #f "~s ~s" c s))
                 (with-ascii-or-latin1-locale i
                  (let* ((q (with-unicode (regexp-quote s)))
                         (m (regexp-exec (make-regexp q flag) s)))
                    (and (= 0 (match:start m))
                         (= 1 (match:end m))))))))

	   ;; Try on pattern "aX" where X is each latin-1 character,
	   ;; except #\nul.  This exposes things like "?" which are
	   ;; special only when they follow a pattern to repeat or
	   ;; whatever ("a" in this case).
	   (do ((i 1 (1+ i)))
	       ((>= i 256))
             (let* ((c (integer->char i))
                    (s (string #\a c))
                    (q (with-unicode (regexp-quote s))))
               (pass-if (list "string \"aX\"" i (format #f "~s ~s ~s" c s q))
                 (with-ascii-or-latin1-locale i
		  (let* ((m (regexp-exec (make-regexp q flag) s)))
                    (and (= 0 (match:start m))
                         (= 2 (match:end m))))))))

	   (pass-if "string of all chars"
             (with-latin1-locale
               (let ((m (regexp-exec (make-regexp (with-unicode
                                                   (regexp-quote allchars))
                                                  flag) allchars)))
                 (and (= 0 (match:start m))
                      (= (string-length allchars) (match:end m)))))))))
     lst)))

;;;
;;; regexp-substitute
;;;

(with-test-prefix "regexp-substitute"
  (let ((match
	 (string-match "patleft(sub1)patmid(sub2)patright"
		       "contleftpatleftsub1patmidsub2patrightcontright")))
    (define (try expected . args)
      (with-test-prefix (object->string args)
	(apply vary-port regexp-substitute expected match args)))

    (try "")
    (try "string1" "string1")
    (try "string1string2" "string1" "string2")
    (try "patleftsub1patmidsub2patright" 0)
    (try "hi-patleftsub1patmidsub2patright-bye" "hi-" 0 "-bye")
    (try "sub1" 1)
    (try "hi-sub1-bye" "hi-" 1 "-bye")
    (try "hi-sub2-bye" "hi-" 2 "-bye")
    (try "contleft" 'pre)
    (try "contright" 'post)
    (try "contrightcontleft" 'post 'pre)
    (try "contrightcontleftcontrightcontleft" 'post 'pre 'post 'pre)
    (try "contrightsub2sub1contleft" 'post 2 1 'pre)
    (try "foosub1sub1sub1sub1bar" "foo" 1 1 1 1 "bar")))

(with-test-prefix "regexp-substitute/global"
  
  (define (try expected . args)
    (with-test-prefix (object->string args)
      (apply vary-port regexp-substitute/global expected args)))

  (try "hi" "a(x*)b" "ab" "hi")
  (try ""   "a(x*)b" "ab" 1)
  (try "xx" "a(x*)b" "axxb" 1)
  (try "xx" "a(x*)b" "_axxb_" 1)
  (try "pre" "a(x*)b" "preaxxbpost" 'pre)
  (try "post" "a(x*)b" "preaxxbpost" 'post)
  (try "string" "x" "string" 'pre "y" 'post)
  (try "4" "a(x*)b" "_axxb_" (lambda (m)
				(number->string (match:end m 1))))

  (try "_aybycyd_" "x+" "_axbxxcxxxd_" 'pre "y" 'post)

  ;; This should not go into an infinite loop, just because the regexp
  ;; can match the empty string.  This test also kind of beats on our
  ;; definition of where a null string can match.
  (try "y_yaybycydy_y" "x*" "_axbxxcxxxd_" 'pre "y" 'post)

  ;; These kind of bother me.  The extension from regexp-substitute to
  ;; regexp-substitute/global is only natural if your item list
  ;; includes both pre and post.  If those are required, why bother
  ;; to include them at all?
  (try "4:7:12:_" "a(x*)b" "_axxbaxbaxxxb_"
       (lambda (m) (number->string (match:end m 1))) ":"
       'post)
  (try "4:10:19:_:19:10:4" "a(x*)b" "_axxbaxxxxbaxxxxxxxb_"
       (lambda (m) (number->string (match:end m 1))) ":"
       'post
       ":" (lambda (m) (number->string (match:end m 1))))

  ;; Jan Nieuwenhuizen's bug, 2 Sep 1999
  (try "" "_" (make-string 500 #\_)
       'post))

(with-test-prefix "nonascii locales"
  (pass-if "match structures refer to char offsets"
    (with-locale "en_US.utf8"
      ;; bug #31650
      (equal? (match:substring (string-match ".*" "calçot") 0)
              "calçot")))

  (pass-if "match structures refer to char offsets, non-ASCII pattern"
    (with-locale "en_US.utf8"
      ;; bug #31650
      (equal? (match:substring (string-match "λ: The Ultimate (.*)"
                                             "λ: The Ultimate GOTO")
                               1)
              "GOTO"))))
