%%html
<script src="https://bits.csb.pitt.edu/preamble.js"></script>


# Anti-pattern: the index is tracked by hand in a while loop, and
# float(sys.argv[3]) is re-evaluated on every pass through the loop body.
frame = 0
while frame < length:
    if values[frame] < float(sys.argv[3]):
        pass #do something
    frame += 1


# Convert the command-line cutoff once, before the loop, then iterate over
# the elements of values directly instead of indexing into it.
cutoff = float(sys.argv[3])
for value in values:
    if value < cutoff:
        pass #dostuff


# Anti-pattern: counting the elements below cutoff with an explicit
# index loop written in Python.
cnt = 0
for i in range(len(array)):
    if array[i] < cutoff:
        cnt += 1
# Bare expression so the notebook displays the final count.
cnt

3


# Same count in a single call: count the nonzero (True) entries of the comparison.
# NOTE(review): assumes array is a NumPy array so `array < cutoff` is elementwise - confirm.
np.count_nonzero(array < cutoff)

3


# Anti-pattern: list(...) builds a throwaway list only to iterate over it.
for i in list(range(3)):
    pass


# Iterate over the range object directly; no intermediate list is created.
for i in range(3):
    pass


# Anti-pattern: set(L) is rebuilt from the list on every iteration of the loop.
L = [1,2,3]
for i in range(10):
    if i in set(L):
        pass


# Create the set a single time, ahead of the loop, so that every membership
# test below reuses it instead of rebuilding it.
L = {1, 2, 3}
for i in range(10):
    if i in L:
        pass


import re
regex = re.compile('abc')
regex

re.compile(r'abc', re.UNICODE)


regex = re.compile('abc')
regex.search('xyzabc')

<re.Match object; span=(3, 6), match='abc'>


print(regex.match('xyzabc')) #matches at beginning of string only

None


regex = re.compile('(abc)def')
match = regex.search('xyzabcdef')


match.groups()

('abc',)


match.group(1)

'abc'


match.group(0) #group zero is always the whole match

'abcdef'


regex = re.compile('(a(b(c)))def')
match = regex.search('xyzabcdefg')


%%html
<div id="regroups" style="width: 500px"></div>
<script>

    var divid = '#regroups';
	jQuery(divid).asker({
	    id: divid,
	    question: "How many groups are in match?",
		answers: ['0','1','2','3','4'],
        server: "https://bits.csb.pitt.edu/asker.js/example/asker.cgi",
		charter: chartmaker})
    
$(".jp-InputArea .o:contains(html)").closest('.jp-InputArea').hide();


</script>


match.groups()

('abc', 'bc', 'c')


match.group(0)

'abcdef'


re.search('abc','abcxyz') # this searches the string 'abcxyz' using the regex 'abc'

<re.Match object; span=(0, 3), match='abc'>


regex = re.compile('abc')
regex.search('abcxyz') #same as above

<re.Match object; span=(0, 3), match='abc'>


%%html
<div id="reslashes" style="width: 500px"></div>
<script>

    var divid = '#reslashes';
	jQuery(divid).asker({
	    id: divid,
	    question: "How would you write a regular expression to match \\x\\?",
		answers: ['\\x\\','\\\\x\\\\','\\\\\\x\\\\\\','\\\\\\\\x\\\\\\\\'],
        server: "https://bits.csb.pitt.edu/asker.js/example/asker.cgi",
		charter: chartmaker})
    
$(".jp-InputArea .o:contains(html)").closest('.jp-InputArea').hide();


</script>


firsttry = '\x\'

  File "/var/folders/c_/pwm7n7_174724g8zkkqlpr3m0000gn/T/ipykernel_58822/673471912.py", line 1
    firsttry = '\x\'
                    ^
SyntaxError: EOL while scanning string literal


secondtry = '\\x\\'


print(secondtry,len(secondtry))

\x\ 3


# secondtry holds the three characters \x\ ; used as a regex pattern, its final
# backslash has nothing left to escape, so compiling it raises re.error.
regex = re.compile(secondtry)

---------------------------------------------------------------------------
error                                     Traceback (most recent call last)
/var/folders/c_/pwm7n7_174724g8zkkqlpr3m0000gn/T/ipykernel_58822/3245247240.py in <module>
----> 1 regex = re.compile(secondtry)

/Applications/Xcode.app/Contents/Developer/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/re.py in compile(pattern, flags)
    250 def compile(pattern, flags=0):
    251     "Compile a regular expression pattern, returning a Pattern object."
--> 252     return _compile(pattern, flags)
    253 
    254 def purge():

/Applications/Xcode.app/Contents/Developer/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/re.py in _compile(pattern, flags)
    302     if not sre_compile.isstring(pattern):
    303         raise TypeError("first argument must be string or compiled pattern")
--> 304     p = sre_compile.compile(pattern, flags)
    305     if not (flags & DEBUG):
    306         if len(_cache) >= _MAXCACHE:

/Applications/Xcode.app/Contents/Developer/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/sre_compile.py in compile(p, flags)
    762     if isstring(p):
    763         pattern = p
--> 764         p = sre_parse.parse(p, flags)
    765     else:
    766         pattern = None

/Applications/Xcode.app/Contents/Developer/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/sre_parse.py in parse(str, flags, state)
    946 
    947     try:
--> 948         p = _parse_sub(source, state, flags & SRE_FLAG_VERBOSE, 0)
    949     except Verbose:
    950         # the VERBOSE flag was switched on inside the pattern.  to be

/Applications/Xcode.app/Contents/Developer/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/sre_parse.py in _parse_sub(source, state, verbose, nested)
    441     start = source.tell()
    442     while True:
--> 443         itemsappend(_parse(source, state, verbose, nested + 1,
    444                            not nested and not items))
    445         if not sourcematch("|"):

/Applications/Xcode.app/Contents/Developer/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/sre_parse.py in _parse(source, state, verbose, nested, first)
    509         if this in "|)":
    510             break # end of subpattern
--> 511         sourceget()
    512 
    513         if verbose:

/Applications/Xcode.app/Contents/Developer/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/sre_parse.py in get(self)
    254     def get(self):
    255         this = self.next
--> 256         self.__next()
    257         return this
    258     def getwhile(self, n, charset):

/Applications/Xcode.app/Contents/Developer/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/sre_parse.py in __next(self)
    243                 char += self.decoded_string[index]
    244             except IndexError:
--> 245                 raise error("bad escape (end of pattern)",
    246                             self.string, len(self.string) - 1) from None
    247         self.index = index + 1

error: bad escape (end of pattern) at position 2


thirdtry = '\\\\x\\\\'


print(thirdtry,len(thirdtry))

\\x\\ 5


regex = re.compile(thirdtry)


regex.search('\\x\\')

<re.Match object; span=(0, 3), match='\\x\\'>


normal_str = '\\x\\'
raw_str = r'\\x\\'


print(normal_str,raw_str)

\x\ \\x\\


print(r'\x\')

  File "/var/folders/c_/pwm7n7_174724g8zkkqlpr3m0000gn/T/ipykernel_58822/2066912133.py", line 1
    print(r'\x\')
                 ^
SyntaxError: EOL while scanning string literal


bool(re.search(r'a|b','xxxaxxx'))

True


bool(re.search(r'abc|xyz','axbycz'))

False


bool(re.search(r'abc|xyz','xxxyzxxx'))

True


bool(re.search(r'a*','xxxxx'))

True


bool(re.search(r'a+','xxxxx'))

False


m = re.search(r'a+(.*)','aaba')


%%html
<div id="regreedy" style="width: 500px"></div>
<script>

    var divid = '#regreedy';
	jQuery(divid).asker({
	    id: divid,
	    question: "What is m.group(1)?",
		answers: ['a','b','ba','ab','aba','aaba'],
        server: "https://bits.csb.pitt.edu/asker.js/example/asker.cgi",
		charter: chartmaker})
    
$(".jp-InputArea .o:contains(html)").closest('.jp-InputArea').hide();


</script>


m.groups()

('ba',)


m1 = re.search(r'a*(.*)','aaba')
m2 = re.search(r'a+(.*)','aaba')


m1.groups(),m2.groups()

(('ba',), ('ba',))


m3 = re.search(r'a*?(.*)','aaba')


m3 = re.search(r'a*?(.*)','aaba')


%%html
<div id="remstar3" style="width: 500px"></div>
<script>

    var divid = '#remstar3';
	jQuery(divid).asker({
	    id: divid,
	    question: "What is m3.group(1)?",
		answers: ['aaba','aba','ba','ABBA'],
        server: "https://bits.csb.pitt.edu/asker.js/example/asker.cgi",
		charter: chartmaker})
    
$(".jp-InputArea .o:contains(html)").closest('.jp-InputArea').hide();


</script>


m3.groups()

('aaba',)


m = re.search(r'a+?(.*)','aaba')


%%html
<div id="rem1" style="width: 500px"></div>
<script>

    var divid = '#rem1';
	jQuery(divid).asker({
	    id: divid,
	    question: "What is m.group(1)?",
		answers: ['aaba','aba','ba','ABBA'],
        server: "https://bits.csb.pitt.edu/asker.js/example/asker.cgi",
		charter: chartmaker})
    
$(".jp-InputArea .o:contains(html)").closest('.jp-InputArea').hide();


</script>


m.group(1)

'aba'


bool(re.search('^abc','xyzabc'))

False


bool(re.search('abc$','xyzabc'))

True


m = re.search(r'([0-9])','BST3')


m.groups()

('3',)


m = re.search(r'([cat])','garfield')


%%html
<div id="reset" style="width: 500px"></div>
<script>

    var divid = '#reset';
	jQuery(divid).asker({
	    id: divid,
	    question: "What's in m.group(1)?",
		answers: ['Nothing','cat','c','a','t','garfield'],
        server: "https://bits.csb.pitt.edu/asker.js/example/asker.cgi",
		charter: chartmaker})
    
$(".jp-InputArea .o:contains(html)").closest('.jp-InputArea').hide();


</script>


m.group(1)

'a'


# [^ ] is a complemented set: any single character that is not a space.
r3 = re.compile(r'([^ ]*)')
m3 = r3.search('Hello World')
m3.groups()

('Hello',)


re.search(r'(\w+)-(\w+)','de-hyphen').groups()

('de', 'hyphen')


# Pieces of the pattern, left to right:
#   [+-]?                 optional sign
#   (\d+(\.\d*)?|\.\d+)   digits with an optional fractional part, or a
#                         fraction that starts with the decimal point
#   ([eE][+-]?\d+)?       optional exponent with its own optional sign
float_regex = re.compile(r'[+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?')


float_regex.match('3.14159')

<re.Match object; span=(0, 7), match='3.14159'>


r = re.compile(r'\d?\d.(png|jpg)')


%%html
<div id="reex1" style="width: 500px"></div>
<script>

    var divid = '#reex1';
	jQuery(divid).asker({
	    id: divid,
	    question: "Which string will NOT match",
		answers: ['0.png','15.jpg','93png','100.jpg'],
        server: "https://bits.csb.pitt.edu/asker.js/example/asker.cgi",
		charter: chartmaker})
    
$(".jp-InputArea .o:contains(html)").closest('.jp-InputArea').hide();


</script>


# The dot is escaped so it only matches a literal '.'; group 1 captures the
# word characters that come directly before @pitt.edu.
m = re.search(r'(\w*)@pitt\.edu','dkoes@pitt.edu')
m.group(1)

'dkoes'


# \1 is a backreference: it must match the same text that group 1 captured,
# so the pattern only finds a word that is repeated after whitespace.
regex = re.compile(r'(\w+)\s+\1')
m1 = regex.search('cat cat')
m2 = regex.search('cat dog')
m1.groups(),m2

(('cat',), None)


# (?P<name>...) gives a group a name, so it can be fetched with
# m.group('name') instead of by its number.
regex = re.compile(r'(?P<last>\w+), (?P<first>\w+)')
m = regex.search('Koes, David')
print(m.group('first'),m.group('last'))

David Koes


# (?P=animal) is a backreference to the named group: it must match the same
# text that the group called animal captured.
regex = re.compile(r'(?P<animal>\w+)\s+(?P=animal)')
m1 = regex.search('cat cat')
m1.groups()

('cat',)


# Without re.MULTILINE, ^ matches only at the start of the string and $ only
# at its end (or just before a final newline), so nothing is found here.
print(re.search(r'^cat$','cat\ndog'))

None


# With re.MULTILINE, ^ and $ also match at the start and end of each line.
regex = re.compile(r'^cat$',re.MULTILINE)
regex.search('cat\ndog')

<re.Match object; span=(0, 3), match='cat'>


# Split on runs of whitespace; the separators themselves are dropped.
re.split(r'\s+',"A bunch of   spacey\nwords.")

['A', 'bunch', 'of', 'spacey', 'words.']


# Putting the separator pattern in a capturing group makes split keep the
# matched separators as items in the returned list.
re.split(r'(\s+)',"A bunch of   spacey\nwords.")

['A', ' ', 'bunch', ' ', 'of', '   ', 'spacey', '\n', 'words.']


# A pattern without groups: findall returns each matched substring.
bigstr = 'abc xyz abc a x'
re.findall('abc',bigstr)

['abc', 'abc']


# With exactly one group, findall returns that group's text for each match
# rather than the whole match.
re.findall(r'(a)bc',bigstr)

['a', 'a']


# With several groups, findall returns one tuple of group texts per match.
re.findall(r'(a)b(c)',bigstr)

[('a', 'c'), ('a', 'c')]


matches = re.findall(r'(\S+)|(\S+)','x|y a|b')


%%html
<div id="reexfindall" style="width: 500px"></div>
<script>
    var divid = '#reexfindall';
	jQuery(divid).asker({
	    id: divid,
	    question: "What is in matches[0]",
		answers: ["('x','y')","('x|y')","'x|y'","('x|y','')",'Error'],
        server: "https://bits.csb.pitt.edu/asker.js/example/asker.cgi",
		charter: chartmaker})
    
$(".jp-InputArea .o:contains(html)").closest('.jp-InputArea').hide();


</script>


matches

[('x|y', ''), ('a|b', '')]


list_of_names = 'Koes, David\nKarplus, Martin\nLevitt, Michael\nWarshel, Arieh\n'


# finditer yields a match object for each successive match in the string;
# the named groups of each one are printed in first-then-last order.
for m in re.finditer(r'(?P<last>\w+), (?P<first>\w+)',list_of_names):
    print(m.group('first'),m.group('last'))

David Koes
Martin Karplus
Michael Levitt
Arieh Warshel


pdb = '''ATOM   2267  N   THR A 609       4.155  42.962  60.898  1.00  9.19           N  
ATOM   2268  CA  THR A 609       3.520  44.246  60.575  1.00 10.78           C  
ATOM   2269  C   THR A 609       4.491  45.117  59.815  1.00 11.13           C  
ATOM   2270  O   THR A 609       5.689  44.864  59.853  1.00  9.92           O'''

# Replace every occurrence of ' A ' with ' B ' throughout the multi-line
# string; re.sub returns a new string and leaves pdb itself unchanged.
print(re.sub(r' A ',' B ',pdb))

ATOM   2267  N   THR B 609       4.155  42.962  60.898  1.00  9.19           N  
ATOM   2268  CA  THR B 609       3.520  44.246  60.575  1.00 10.78           C  
ATOM   2269  C   THR B 609       4.491  45.117  59.815  1.00 11.13           C  
ATOM   2270  O   THR B 609       5.689  44.864  59.853  1.00  9.92           O


!wget http://mscbio2025.csb.pitt.edu/files/alignment.txt

--2023-10-18 21:56:26--  http://mscbio2025.csb.pitt.edu/files/alignment.txt
Resolving mscbio2025.csb.pitt.edu (mscbio2025.csb.pitt.edu)... 136.142.4.139
Connecting to mscbio2025.csb.pitt.edu (mscbio2025.csb.pitt.edu)|136.142.4.139|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 86458 (84K) [text/plain]
Saving to: ‘alignment.txt’

alignment.txt       100%[===================>]  84.43K  --.-KB/s    in 0.02s   

2023-10-18 21:56:27 (4.52 MB/s) - ‘alignment.txt’ saved [86458/86458]


!head alignment.txt

BLASTP 2.2.28+
Reference: Stephen F. Altschul, Thomas L. Madden, Alejandro
A. Schaffer, Jinghui Zhang, Zheng Zhang, Webb Miller, and
David J. Lipman (1997), "Gapped BLAST and PSI-BLAST: a new
generation of protein database search programs", Nucleic
Acids Res. 25:3389-3402.


Reference for compositional score matrix adjustment: Stephen
F. Altschul, John C. Wootton, E. Michael Gertz, Richa


# Read the whole alignment file into memory. The with-block guarantees the
# file handle is closed as soon as the read finishes (or fails), instead of
# leaving it open until the garbage collector gets to it.
with open('alignment.txt') as alignment_file:
    data = alignment_file.read()

regex*	Match regex zero or more times (Kleene star)
regex?	Match regex one or zero times
regex+	Match regex one or more times
regex{m}	Match regex exactly `m` times
regex{m,n}	Match regex between `m` and `n` times (as many as possible)

Regular Expressions¶

10/19/2023¶

Anti-Patterns¶

`re`¶

Matching vs Searching¶

Extracting¶

Groups¶

Using Regular Expressions¶

Regular Expression Syntax¶

The Backslash Problem¶

Raw Strings¶

Operators¶

Operators: multiple matches¶

Non-greedy Kleene¶

Special Characters¶

Character Sets¶

Character Set Complements¶

Predefined Character Sets¶

Groups¶

Named Groups¶

Compiling Regular Expressions¶

More Regular Expression Functions¶

`split`¶

`findall`¶

`finditer`¶

`sub`¶

Some Theory¶

Regular Languages¶

Finite Automata¶

Exercise¶

Answer these questions using regular expressions¶

Regular Expressions¶

10/19/2023¶

Anti-Patterns¶

re¶

Matching vs Searching¶

Extracting¶

Groups¶

Using Regular Expressions¶

Regular Expression Syntax¶

The Backslash Problem¶

Raw Strings¶

Operators¶

Operators: multiple matches¶

Non-greedy Kleene¶

Special Characters¶

Character Sets¶

Character Set Complements¶

Predefined Character Sets¶

Groups¶

Named Groups¶

Compiling Regular Expressions¶

More Regular Expression Functions¶

split¶

findall¶

finditer¶

sub¶

Some Theory¶

Regular Languages¶

Finite Automata¶

Exercise¶

Answer these questions using regular expressions¶

`re`¶

`split`¶

`findall`¶

`finditer`¶

`sub`¶